From 4a205ac815374b31bdc4141f98f2479f95f78ffd Mon Sep 17 00:00:00 2001 From: Leon Oostrum Date: Fri, 26 Jul 2024 17:45:40 +0200 Subject: [PATCH] Add .clang-format --- .clang-format | 280 ++++++++++++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 4 +- include/tcbf.h | 17 ++- src/main.cu | 23 ++-- src/prepare_a_matrix.cu | 92 +++++-------- src/tcbf.cu | 50 +++---- 6 files changed, 349 insertions(+), 117 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..1390c85 --- /dev/null +++ b/.clang-format @@ -0,0 +1,280 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false +AlignEscapedNewlines: Left +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowBreakBeforeNoexceptSpecifier: Never +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortCompoundRequirementOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterExternBlock: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAdjacentStringLiterals: true +BreakAfterAttributes: Leave +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Attach +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: true +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: false +IndentCaseLabels: true +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +KeepEmptyLinesAtEOF: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: NextLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakScopeResolution: 500 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +PPIndentWidth: -1 +QualifierAlignment: Leave +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + - ParseTestProto + - ParsePartialTestProto + CanonicalDelimiter: pb + BasedOnStyle: google +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SkipMacroDefinitionBody: false +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterPlacementOperator: true + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Never +SpacesInParensOptions: + InCStyleCasts: false + InConditionalStatements: false + InEmptyParentheses: false + Other: false +SpacesInSquareBrackets: false +Standard: Auto +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +VerilogBreakBetweenInstancePorts: true +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7eceb16..1f03ee0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,10 @@ repos: - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v17.0.6 + rev: v18.1.8 hooks: - id: clang-format - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.13 hooks: - id: cmake-format - - id: cmake-lint \ No newline at end of file + - id: cmake-lint diff --git a/include/tcbf.h b/include/tcbf.h index 790b763..fce00d9 100644 --- a/include/tcbf.h +++ b/include/tcbf.h @@ -1,16 +1,16 @@ #ifndef TCBF_H_ #define TCBF_H_ +#include + #include #include -#include namespace tcbf { class Beamformer { -public: - Beamformer(const size_t pixels, const size_t frames, const size_t samples, - cu::Device &device, cu::Stream &stream); + public: + Beamformer(const size_t pixels, const size_t frames, const size_t samples, cu::Device &device, cu::Stream &stream); // static void prepare_A_matrix(const std::string input_path, const // std::string output_path, const size_t pixels, const size_t samples, @@ -20,7 +20,7 @@ class Beamformer { void write_BF(cu::HostMemory &BF, const std::string path); void process(cu::HostMemory &RF, cu::HostMemory &BF); -private: + private: void RF_to_device(cu::HostMemory &RF); void BF_to_host(cu::HostMemory &BF); @@ -28,8 +28,7 @@ class Beamformer { static const size_t kBatchSize{1}; static const ccglib::mma::Precision kGEMMPrecision{ccglib::mma::int1}; static const ccglib::mma::Variant kGEMMVariant{ccglib::mma::opt}; - const dim3 kGEMMTileSize{ - ccglib::mma::GEMM::GetDimensions(kGEMMPrecision, kGEMMVariant)}; + const dim3 kGEMMTileSize{ccglib::mma::GEMM::GetDimensions(kGEMMPrecision, kGEMMVariant)}; std::unique_ptr d_A; std::unique_ptr d_RF; @@ -58,6 +57,6 @@ class Beamformer { cu::Stream &stream_; }; -} // namespace tcbf +} // namespace tcbf -#endif // TCBF_H_ \ No newline at end of file +#endif // TCBF_H_ \ No newline at end of file diff --git a/src/main.cu b/src/main.cu index bd8b1ba..d84bbd7 100644 --- a/src/main.cu +++ b/src/main.cu @@ -1,22 +1,18 @@ +#include + #include #include #include -#include - cxxopts::Options create_commandline_parser(const char *argv[]) { cxxopts::Options options(argv[0], "Echoframe standalone beamformer"); - options.add_options()("a_matrix", "Full path to input prepared A matrix", - cxxopts::value())( - "rf", "Full path to input RF", cxxopts::value())( - "bf", "Full path to output BF", cxxopts::value())( - "pixels", "Number of pixels", cxxopts::value())( - "frames", "Number of frames", cxxopts::value())( + options.add_options()("a_matrix", "Full path to input prepared A matrix", cxxopts::value())( + "rf", "Full path to input RF", cxxopts::value())("bf", "Full path to output BF", + cxxopts::value())( + "pixels", "Number of pixels", cxxopts::value())("frames", "Number of frames", cxxopts::value())( "samples", "Number of samples", cxxopts::value())( - "device", "GPU device ID", - cxxopts::value()->default_value(std::to_string(0)))( - "h,help", "Print help"); + "device", "GPU device ID", cxxopts::value()->default_value(std::to_string(0)))("h,help", "Print help"); return options; } @@ -32,8 +28,7 @@ cxxopts::ParseResult parse_commandline(int argc, const char *argv[]) { exit(EXIT_SUCCESS); } - std::vector required_options{"a_matrix", "rf", "bf", - "pixels", "frames", "samples"}; + std::vector required_options{"a_matrix", "rf", "bf", "pixels", "frames", "samples"}; for (auto &opt : required_options) { if (!result.count(opt)) { std::cerr << "Required argument missing: " << opt << std::endl; @@ -72,4 +67,4 @@ int main(int argc, const char *argv[]) { beamformer.read_RF(RF, path_rf); beamformer.process(RF, BF); beamformer.write_BF(BF, path_bf); -} \ No newline at end of file +} diff --git a/src/prepare_a_matrix.cu b/src/prepare_a_matrix.cu index fa808af..c2e4ecf 100644 --- a/src/prepare_a_matrix.cu +++ b/src/prepare_a_matrix.cu @@ -1,12 +1,12 @@ +#include #include -#include -#include -#include #include +#include #include -#include -#include +#include +#include +#include __global__ void conjugate_1bit(unsigned *data, const size_t n_bytes) { size_t tid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); @@ -17,23 +17,15 @@ __global__ void conjugate_1bit(unsigned *data, const size_t n_bytes) { data[tid] = ~data[tid]; } -inline size_t align(size_t a, size_t b) { - return b * ccglib::helper::ceildiv(a, b); -} +inline size_t align(size_t a, size_t b) { return b * ccglib::helper::ceildiv(a, b); } cxxopts::Options create_commandline_parser(const char *argv[]) { - cxxopts::Options options(argv[0], - "Echoframe beamformer A matrix preparation"); - - options.add_options()("a_matrix_in", "Full path to input A matrix", - cxxopts::value())( - "a_matrix_out", "Full path to output A matrix", - cxxopts::value())("pixels", "Number of pixels", - cxxopts::value())( - "samples", "Number of samples", cxxopts::value())( - "device", "GPU device ID", - cxxopts::value()->default_value(std::to_string(0)))( - "h,help", "Print help"); + cxxopts::Options options(argv[0], "Echoframe beamformer A matrix preparation"); + + options.add_options()("a_matrix_in", "Full path to input A matrix", cxxopts::value())( + "a_matrix_out", "Full path to output A matrix", cxxopts::value())( + "pixels", "Number of pixels", cxxopts::value())("samples", "Number of samples", cxxopts::value())( + "device", "GPU device ID", cxxopts::value()->default_value(std::to_string(0)))("h,help", "Print help"); return options; } @@ -49,8 +41,7 @@ cxxopts::ParseResult parse_commandline(int argc, const char *argv[]) { exit(EXIT_SUCCESS); } - std::vector required_options{"a_matrix_in", "a_matrix_out", - "pixels", "samples"}; + std::vector required_options{"a_matrix_in", "a_matrix_out", "pixels", "samples"}; for (auto &opt : required_options) { if (!result.count(opt)) { std::cerr << "Required argument missing: " << opt << std::endl; @@ -69,8 +60,7 @@ cxxopts::ParseResult parse_commandline(int argc, const char *argv[]) { int main(int argc, const char *argv[]) { cxxopts::ParseResult cmdline = parse_commandline(argc, argv); const std::string path_a_matrix_in = cmdline["a_matrix_in"].as(); - const std::string path_a_matrix_out = - cmdline["a_matrix_out"].as(); + const std::string path_a_matrix_out = cmdline["a_matrix_out"].as(); const size_t pixels = cmdline["pixels"].as(); const size_t samples = cmdline["samples"].as(); const unsigned device_id = cmdline["device"].as(); @@ -81,8 +71,7 @@ int main(int argc, const char *argv[]) { cu::Stream stream; // tile size in beams, frames, samples axes - dim3 tile_sizes = - ccglib::mma::GEMM::GetDimensions(ccglib::mma::int1, ccglib::mma::opt); + dim3 tile_sizes = ccglib::mma::GEMM::GetDimensions(ccglib::mma::int1, ccglib::mma::opt); const size_t pixels_padded = align(pixels, tile_sizes.x); const size_t samples_padded = align(samples, tile_sizes.z); @@ -90,8 +79,7 @@ int main(int argc, const char *argv[]) { // factor 2 for complex // host is unpadded, device is always padded const size_t bytes_a_matrix = 2UL * pixels * samples; - const size_t bytes_a_matrix_packed = - 2UL * pixels_padded * samples_padded / CHAR_BIT; + const size_t bytes_a_matrix_packed = 2UL * pixels_padded * samples_padded / CHAR_BIT; // Read data from disk cu::HostMemory a_matrix_host(bytes_a_matrix); @@ -126,11 +114,9 @@ int main(int argc, const char *argv[]) { std::cout << "Packing" << std::endl; for (size_t c = 0; c < 2; c++) { const size_t complex_offset_host = c * pixels * samples; - const size_t complex_offset_device_packed = - c * pixels_padded * samples_padded / CHAR_BIT; + const size_t complex_offset_device_packed = c * pixels_padded * samples_padded / CHAR_BIT; // process chunks - for (size_t pixel_start = 0; pixel_start < pixels; - pixel_start += pixels_per_chunk) { + for (size_t pixel_start = 0; pixel_start < pixels; pixel_start += pixels_per_chunk) { size_t local_npixels = pixels_per_chunk; // correct npixels in last chunk if (pixel_start + local_npixels > pixels) { @@ -141,27 +127,19 @@ int main(int argc, const char *argv[]) { // copy chunk to device, row-by-row to handle padding for (size_t pixel = 0; pixel < local_npixels; pixel++) { const size_t d_offset = pixel * samples_padded; - const size_t offset = - (pixel_start + pixel) * samples + complex_offset_host; + const size_t offset = (pixel_start + pixel) * samples + complex_offset_host; const size_t bytes_to_transfer = samples; - cu::DeviceMemory d_a_chunk_slice(d_a_chunk, d_offset, - bytes_to_transfer); - stream.memcpyHtoDAsync(d_a_chunk_slice, - static_cast(a_matrix_host) + offset, - bytes_to_transfer); + cu::DeviceMemory d_a_chunk_slice(d_a_chunk, d_offset, bytes_to_transfer); + stream.memcpyHtoDAsync(d_a_chunk_slice, static_cast(a_matrix_host) + offset, bytes_to_transfer); } // get offset for this chunk in a_packed - cu::DeviceMemory d_a_packed_chunk( - d_a_matrix_packed, - pixel_start * samples_padded / CHAR_BIT + - complex_offset_device_packed, - local_npixels * samples_padded / CHAR_BIT); + cu::DeviceMemory d_a_packed_chunk(d_a_matrix_packed, + pixel_start * samples_padded / CHAR_BIT + complex_offset_device_packed, + local_npixels * samples_padded / CHAR_BIT); // run packing kernel - ccglib::packing::Packing packing(local_npixels * samples_padded, device, - stream); - packing.Run(d_a_chunk, d_a_packed_chunk, ccglib::packing::pack, - ccglib::packing::complex_first); + ccglib::packing::Packing packing(local_npixels * samples_padded, device, stream); + packing.Run(d_a_chunk, d_a_packed_chunk, ccglib::packing::pack, ccglib::packing::complex_first); } } @@ -169,33 +147,27 @@ int main(int argc, const char *argv[]) { std::cout << "Conjugate" << std::endl; dim3 threads(256); dim3 grid(ccglib::helper::ceildiv(bytes_a_matrix_packed / 2, threads.x)); - cu::DeviceMemory d_a_matrix_packed_imag( - d_a_matrix_packed, bytes_a_matrix_packed / 2, bytes_a_matrix_packed / 2); + cu::DeviceMemory d_a_matrix_packed_imag(d_a_matrix_packed, bytes_a_matrix_packed / 2, bytes_a_matrix_packed / 2); conjugate_1bit<<>>( - reinterpret_cast( - static_cast(d_a_matrix_packed_imag)), - bytes_a_matrix_packed / 2); + reinterpret_cast(static_cast(d_a_matrix_packed_imag)), bytes_a_matrix_packed / 2); // transpose std::cout << "Transpose" << std::endl; - ccglib::transpose::Transpose transpose(1, pixels_padded, samples_padded, - tile_sizes.x, tile_sizes.z, 1, device, + ccglib::transpose::Transpose transpose(1, pixels_padded, samples_padded, tile_sizes.x, tile_sizes.z, 1, device, stream); transpose.Run(d_a_matrix_packed, d_a_transposed); // copy output to host std::cout << "Copy to host" << std::endl; cu::HostMemory a_matrix_output(bytes_a_matrix_packed); - stream.memcpyDtoHAsync(a_matrix_output, d_a_transposed, - bytes_a_matrix_packed); + stream.memcpyDtoHAsync(a_matrix_output, d_a_transposed, bytes_a_matrix_packed); stream.synchronize(); // write to disk std::cout << "Write to disk" << std::endl; std::ofstream out(path_a_matrix_out, std::ios::binary | std::ios::out); if (!out) { - throw std::runtime_error("Failed to open output file: " + - path_a_matrix_out); + throw std::runtime_error("Failed to open output file: " + path_a_matrix_out); } out.write(static_cast(a_matrix_output), bytes_a_matrix_packed); -} \ No newline at end of file +} diff --git a/src/tcbf.cu b/src/tcbf.cu index 59bf522..41104b3 100644 --- a/src/tcbf.cu +++ b/src/tcbf.cu @@ -1,26 +1,22 @@ +#include + #include #include #include "tcbf.h" -#include #ifndef COMPLEX #define COMPLEX 2 #endif -static inline size_t align(size_t a, size_t b) { - return b * ccglib::helper::ceildiv(a, b); -} +static inline size_t align(size_t a, size_t b) { return b * ccglib::helper::ceildiv(a, b); } namespace tcbf { -Beamformer::Beamformer(const size_t pixels, const size_t frames, - const size_t samples, cu::Device &device, +Beamformer::Beamformer(const size_t pixels, const size_t frames, const size_t samples, cu::Device &device, cu::Stream &stream) - : pixels_(pixels), frames_(frames), samples_(samples), device_(device), - stream_(stream) { + : pixels_(pixels), frames_(frames), samples_(samples), device_(device), stream_(stream) { // padded sizes - dim3 tile_sizes = - ccglib::mma::GEMM::GetDimensions(kGEMMPrecision, kGEMMVariant); + dim3 tile_sizes = ccglib::mma::GEMM::GetDimensions(kGEMMPrecision, kGEMMVariant); pixels_padded_ = align(pixels, tile_sizes.x); frames_padded_ = align(frames, tile_sizes.y); samples_padded_ = align(samples, tile_sizes.z); @@ -36,15 +32,12 @@ Beamformer::Beamformer(const size_t pixels, const size_t frames, d_RF_transposed = std::make_unique(bytesRFPacked_); d_BF = std::make_unique(bytesBF_); // create objects to run kernels - pack_rf_ = std::make_unique( - COMPLEX * frames_padded_ * samples_padded_, device_, stream_); + pack_rf_ = std::make_unique(COMPLEX * frames_padded_ * samples_padded_, device_, stream_); transpose_rf_ = std::make_unique( - kBatchSize, frames_padded_, samples_padded_, kGEMMTileSize.y, - kGEMMTileSize.z, kBitsPerSample, device_, stream_); - gemm_ = std::make_unique( - kBatchSize, pixels_padded_, frames_padded_, samples_padded_, - kBitsPerSample, device_, stream_, kGEMMPrecision, kGEMMVariant, - ccglib::mma::col_major); + kBatchSize, frames_padded_, samples_padded_, kGEMMTileSize.y, kGEMMTileSize.z, kBitsPerSample, device_, stream_); + gemm_ = + std::make_unique(kBatchSize, pixels_padded_, frames_padded_, samples_padded_, kBitsPerSample, + device_, stream_, kGEMMPrecision, kGEMMVariant, ccglib::mma::col_major); } void Beamformer::read_A_matrix(const std::string path) { @@ -72,8 +65,7 @@ void Beamformer::write_BF(cu::HostMemory &BF, const std::string path) { if (!out) { throw std::runtime_error("Failed to open output file: " + path); } - out.write(static_cast(BF), - COMPLEX * frames_ * pixels_ * sizeof(unsigned)); + out.write(static_cast(BF), COMPLEX * frames_ * pixels_ * sizeof(unsigned)); } void Beamformer::RF_to_device(cu::HostMemory &RF) { @@ -86,8 +78,7 @@ void Beamformer::RF_to_device(cu::HostMemory &RF) { const size_t offset = f * samples_ * 2; const size_t bytes_to_transfer = samples_ * 2; cu::DeviceMemory d_RF_chunk(*d_RF, d_offset, bytes_to_transfer); - stream_.memcpyHtoDAsync(d_RF_chunk, static_cast(RF) + offset, - bytes_to_transfer); + stream_.memcpyHtoDAsync(d_RF_chunk, static_cast(RF) + offset, bytes_to_transfer); } } @@ -97,15 +88,11 @@ void Beamformer::BF_to_host(cu::HostMemory &BF) { for (size_t c = 0; c < COMPLEX; c++) { for (size_t f = 0; f < frames_; f++) { // get objects pointing to start of chunk to transfer - const size_t d_offset = - (c * frames_padded_ * pixels_padded_ + f * pixels_padded_) * - sizeof(unsigned); - const size_t offset = - (c * frames_ * pixels_ + f * pixels_) * sizeof(unsigned); + const size_t d_offset = (c * frames_padded_ * pixels_padded_ + f * pixels_padded_) * sizeof(unsigned); + const size_t offset = (c * frames_ * pixels_ + f * pixels_) * sizeof(unsigned); const size_t bytes_to_transfer = pixels_ * sizeof(unsigned); cu::DeviceMemory d_BF_chunk(*d_BF, d_offset, bytes_to_transfer); - stream_.memcpyDtoHAsync(static_cast(BF) + offset, d_BF_chunk, - bytes_to_transfer); + stream_.memcpyDtoHAsync(static_cast(BF) + offset, d_BF_chunk, bytes_to_transfer); } } } @@ -114,8 +101,7 @@ void Beamformer::process(cu::HostMemory &RF, cu::HostMemory &BF) { // transfer RF to GPU RF_to_device(RF); // pack bits - pack_rf_->Run(*d_RF, *d_RF_packed, ccglib::packing::pack, - ccglib::packing::complex_last); + pack_rf_->Run(*d_RF, *d_RF_packed, ccglib::packing::pack, ccglib::packing::complex_last); // transpose to format required by GEMM transpose_rf_->Run(*d_RF_packed, *d_RF_transposed); // do GEMM @@ -125,4 +111,4 @@ void Beamformer::process(cu::HostMemory &RF, cu::HostMemory &BF) { stream_.synchronize(); } -} // namespace tcbf +} // namespace tcbf