morousg · albertandaluz · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025
diff --git a/.clang-format b/.clang-format
@@ -0,0 +1,234 @@
+---
+BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveAssignments:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: true
+AlignConsecutiveBitFields:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: false
+AlignConsecutiveDeclarations:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: false
+AlignConsecutiveMacros:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCompound: false
+  PadOperators: false
+AlignConsecutiveShortCaseStatements:
+  Enabled: false
+  AcrossEmptyLines: false
+  AcrossComments: false
+  AlignCaseColons: false
+AlignEscapedNewlines: Right
+AlignOperands: Align
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 0
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+AttributeMacros:
+  - __capability
+BinPackArguments: true
+BinPackParameters: true
+BitFieldColonSpacing: Both
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterControlStatement: Never
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  BeforeLambdaBody: false
+  BeforeWhile: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakAfterAttributes: Never
+BreakAfterJavaFieldAnnotations: false
+BreakArrays: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeConceptDeclarations: Always
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: true
+ColumnLimit: 120
+CommentPragmas: "^ IWYU pragma:"
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IfMacros:
+  - KJ_IF_MAYBE
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex: ^"(llvm|llvm-c|clang|clang-c)/
+    Priority: 2
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: ^(<|"(gtest|gmock|isl|json)/)
+    Priority: 3
+    SortPriority: 0
+    CaseSensitive: false
+  - Regex: .*
+    Priority: 1
+    SortPriority: 0
+    CaseSensitive: false
+IncludeIsMainRegex: (Test)?$
+IncludeIsMainSourceRegex: ""
+IndentAccessModifiers: false
+IndentCaseBlocks: false
+IndentCaseLabels: false
+IndentExternBlock: AfterExternBlock
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentRequiresClause: true
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+InsertBraces: false
+InsertNewlineAtEOF: false
+InsertTrailingCommas: None
+IntegerLiteralSeparator:
+  Binary: 0
+  BinaryMinDigits: 0
+  Decimal: 0
+  DecimalMinDigits: 0
+  Hex: 0
+  HexMinDigits: 0
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtEOF: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+LambdaBodyIndentation: Signature
+Language: Cpp
+LineEnding: DeriveLF
+MacroBlockBegin: ""
+MacroBlockEnd: ""
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCBreakBeforeNestedBlockParam: true
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PPIndentWidth: -1
+PackConstructorInitializers: BinPack
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakOpenParenthesis: 0
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyIndentedWhitespace: 0
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+QualifierAlignment: Leave
+ReferenceAlignment: Pointer
+ReflowComments: true
+RemoveBracesLLVM: false
+RemoveParentheses: Leave
+RemoveSemicolon: false
+RequiresClausePosition: OwnLine
+RequiresExpressionIndentation: OuterScope
+SeparateDefinitionBlocks: Leave
+ShortNamespaceLines: 1
+SortIncludes: CaseSensitive
+SortJavaStaticImport: Before
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeJsonColon: false
+SpaceBeforeParens: ControlStatements
+SpaceBeforeParensOptions:
+  AfterControlStatements: true
+  AfterForeachMacros: true
+  AfterFunctionDeclarationName: false
+  AfterFunctionDefinitionName: false
+  AfterIfMacros: true
+  AfterOverloadedOperator: false
+  AfterRequiresInClause: false
+  AfterRequiresInExpression: false
+  BeforeNonEmptyParentheses: false
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: Never
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParens: Never
+SpacesInParensOptions:
+  InConditionalStatements: false
+  InCStyleCasts: false
+  InEmptyParentheses: false
+  Other: false
+SpacesInSquareBrackets: false
+Standard: Latest
+StatementAttributeLikeMacros:
+  - Q_EMIT
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth: 8
+UseTab: Never
+VerilogBreakBetweenInstancePorts: true
+WhitespaceSensitiveMacros:
+  - BOOST_PP_STRINGIZE
+  - CF_SWIFT_NAME
+  - NS_SWIFT_NAME
+  - PP_STRINGIZE
+  - STRINGIZE
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,39 @@
+
+cmake_minimum_required(VERSION 3.22 FATAL_ERROR)
+
+set (PROJECT_VERSION_MAJOR 0)
+set (PROJECT_VERSION_MINOR 1)
+set (PROJECT_VERSION_REV 0)
+set (PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_REV})
+include (cmake/cmake_init.cmake)
+include (cmake/doxygen.cmake)
+include (cmake/targets/virtualfolders.cmake)
+include (cmake/deploy/deploy_dependencies.cmake)
+include (cmake/libs/cuda/archs.cmake)
+include (cmake/generators/version_header.cmake)
+project(FusedKernelLibrary VERSION ${PROJECT_VERSION} LANGUAGES CXX CUDA 
+DESCRIPTION "Implementation of a methodology that allows all sorts of user defined GPU kernel fusion, for non CUDA programmers." 
+HOMEPAGE_URL "https://github.com/morousg/FusedKernelLibrary" )
+
+include (cmake/libs/cuda/cuda.cmake)
+include (cmake/cuda_init.cmake)
+include (cmake/discover_tests.cmake)
+
+option (BUILD_UTEST "build standard unit tests" ON)
+option (ENABLE_BENCHMARK "build benchmarking unit tests" OFF)
+
+add_subdirectory(include)
+add_subdirectory(lib)
+
+
+
+if (${BUILD_UTEST})
+    enable_testing()  
+    add_subdirectory(tests)    
+endif()
+
+
+if (${ENABLE_BENCHMARK})
+    enable_testing()
+    add_subdirectory(benchmarks)
+endif()
diff --git a/README.md b/README.md
@@ -0,0 +1,44 @@
+# Fused Kernel Library (FKL)
+
+This folder contains the FusedKernel library, which can be used independently of OpenCV, and compiled with nvcc.
+
+It is the building grounds of [cvGPUSpeedup](https://github.com/morousg/cvGPUSpeedup). In fact, cvGPUSpeedup is a wrapper arround FusedKernel, made to make it easy to use FusedKernel with OpenCV objects.
+
+The goal is to allow programmers that are used to OpenCV, to very easily and intuituvely be able to use the FusedKernel library.
+
+## Fusion and inclusion
+
+The way the FusedKernel library (FKL) is implemented, allows not only to use the already implemented Operations and data types like Ptr2D or Tensor, but also the fusion can be performend using any code that conforms to the FusedKernel interface (the DeviceFunction structs and the operate function types). The operations in FKL can use any data type that the user wants to use, basic types, structs, tuples (we implemented fk::Tuple to be used on GPU code, along with fk::apply and other utilites).
+
+This was done in purpose to make it easier to join efforts with other libraries that already exist and are also OpenSource and want to take advantage of the FusedKernel library strategy.
+### Horizontal Fusion
+
+This fusion technique is widely known and used. It is based on the idea of processing several data planes in parallel, with the same CUDA kernel. For that, we use the blockIdx.z, to distinguish between thread planes and data planes.
+
+This is usually very beneficial when each plane is very small, and the resulting 2D Grid is not taking advantage of the GPU memory bandwidth.
+
+We also support what we call Divergent Horizontal Fusion. This variant allows to execute different kernels that can be executed in parallel. Each "kernel" can use one or more z planes of the grid, so each kernel can do Horizontal Fusion. This technique allows to exploit the possibility of using diferent components in the SM's in parallel, improving the overall performance.
+
+### Generic Vertical Fusion
+
+Vertical Fusion is usually limited to having a kernel that is configurable up to a certain level, or there is a list of pre-compiled fused kernels to choose from. In our case, we are abstrating away the thread behavior from the actual functionality, and allowing to fuse almost every kernel possible, without having to rewrite neither the thread handling, nor the functionality. You only have to combine code in the different ways that the code can be combined. We call this Generic Vertical Fusion.
+
+For Memory Bound kernels, vertical fusion is bringing most of the performance improvements possible, since adding more functions to the kernel will not increase the execution time, up to a limit where the kernel becomes Compute Bound.
+
+Not only that, but thanks to the way the code is written, the nvcc compiler will treat the consecutive operations as if you where writting the code in one line, adding all sorts of optimizations. This can be seen by compiling the code in Release mode, or in Debug mode. The performance difference is abismal.
+
+### Backwards Generic Vertical Fusion (read and compute only what you need)
+
+This is an optimization that can already be used with the current code, but will be refined and further increase the use cases when addind more Operations.
+
+The idea, is aplicable for situations where you have a big plane, from which you will only use a subset of the data. If you need to transform that plane into something different before doing the operation that will read the subset of elements, you can use vertical fusion in order to have a single kernel, that will read only what it needs, and apply to it all the transformations needed.
+
+For example, let's assume that you receive an image in YUV420_NV12 format, and you need to crop a region of this image, then convert the pixels to RGB, then resize the crop, normalize it to floating point values from 0 to 1, and store the resulting image in RGB planar format. Usually, this would lead to many kernels, one after the other. The first kernel that converts to RGB, will convert the full image, and write the result to memory. Instead, with the Fused Kernel library, it is possible to create a Fused Kernel in a few lines, that will only read the YUV data for the pixels required by the interpolation process, in the resize of the crop. All the steps will be performed using GPU registers, until the last step where we will finally write into GPU ram memory.
+
+This is way faster than the conventional way of programming CUDA.
+
+## Closed source friendly
+
+A company that has it's own CUDA kernels, and wants to start fusing them along with operations present in this library, they can do so by shaping their kernels into a conformant FusedKernel Operation, that can be passed as a template parameter of one of the FKL DeviceFunction structs.
+
+With this strategy, they don't need to share any of their code. They just need to make their kernels fusionable.
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -0,0 +1,56 @@
+
+function (discover_benchmark DIR)
+    file(
+        GLOB_RECURSE
+        CUDA_SOURCES
+        CONFIGURE_DEPENDS
+        "${DIR}/*.cpp"
+        "${DIR}/*.cu"
+    )
+
+    foreach(cuda_source ${CUDA_SOURCES})
+        get_filename_component(cuda_target ${cuda_source} NAME_WE)    
+        add_executable(${cuda_target} ${cuda_source} ${LAUNCH_SOURCES}) 
+        add_test(NAME  ${cuda_target} COMMAND ${cuda_target})
+        target_link_libraries(${cuda_target} PRIVATE CUDA::nppc CUDA::nppial CUDA::nppidei CUDA::nppig headers ${PROJECT_NAME}::${PROJECT_NAME})        
+        set_property(TARGET ${cuda_target} PROPERTY FOLDER benchmarks/${DIR_NAME})
+        set_target_properties(${cuda_target} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO)  
+        target_include_directories(${cuda_target} PRIVATE "${CMAKE_SOURCE_DIR}")
+        set_target_cuda_arch_flags(${cuda_target})        
+        add_cuda_to_target(${cuda_target} "")
+
+        if(${ENABLE_DEBUG})
+            add_cuda_debug_support_to_target(${cuda_target})
+        endif()
+
+        if(${ENABLE_NVTX})
+            add_nvtx_support_to_target(${cuda_target})
+        endif()
+
+        if(${ENABLE_BENCHMARK})
+            target_compile_definitions(${cuda_target} PRIVATE ENABLE_BENCHMARK)
+        endif()
+
+
+    endforeach()
+endfunction()
+
+set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OUT_DIR})
+
+MACRO(SUBDIRLIST result curdir)
+FILE(GLOB children ${curdir}/*) # 
+SET(dirlist "")
+FOREACH(child ${children})
+    IF(IS_DIRECTORY ${child}) 
+        LIST(APPEND dirlist ${child})
+    ENDIF()
+ENDFOREACH()
+SET(${result} ${dirlist} )
+ENDMACRO()
+
+set (LIST_OF_DIRS "")
+SUBDIRLIST(LIST_DIRS ${CMAKE_CURRENT_SOURCE_DIR})
+
+foreach(DIR ${LIST_DIRS})
+    discover_benchmark(${DIR})    
+endforeach()