diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000000..b721f806e4f --- /dev/null +++ b/.clang-format @@ -0,0 +1,178 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' 
+CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertTrailingCommas: None +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +PPIndentWidth: -1 +ReferenceAlignment: Pointer +ReflowComments: false +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before 
+SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE + - NS_SWIFT_NAME + - CF_SWIFT_NAME +... 
+ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 52d81d02172..c566a3dc630 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,11 +5,13 @@ exclude: '^tools/(build_utils/fypp)' fail_fast: false repos: - repo: https://github.com/ambv/black - rev: 21.5b1 + rev: 22.1.0 hooks: - id: black + name: Reformat Python files with the black code formatter + files: '^.*(/PACKAGE|\.py)$' - repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.4 + rev: 4.0.1 hooks: - id: flake8 exclude: >- @@ -17,10 +19,12 @@ repos: .cp2k/.*| )$ - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.1.0 hooks: - id: check-ast - id: check-yaml + - id: check-symlinks + - id: trailing-whitespace - repo: https://github.com/pseewald/fprettify rev: v0.3.7 hooks: diff --git a/LICENSE b/LICENSE index c4dec7ebadc..e648aa59503 100644 --- a/LICENSE +++ b/LICENSE @@ -291,7 +291,7 @@ convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. DBCSR: Distributed Block Compressed Sparse Row matrix library - Copyright (C) by the DBCSR developers group - All rights reserved + Copyright (C) by the DBCSR developers group - All rights reserved This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/docs/guide/3-developer-guide/1-tooling/index.md b/docs/guide/3-developer-guide/1-tooling/index.md index 8470abe8b65..bf113b8966e 100644 --- a/docs/guide/3-developer-guide/1-tooling/index.md +++ b/docs/guide/3-developer-guide/1-tooling/index.md @@ -2,7 +2,7 @@ title: Tooling # Build System -We support CMake for compilation. See [here](../../2-user-guide/1-installation/index.html) on how to compile and +We support CMake for compilation. 
See [here](../../2-user-guide/1-installation/index.html) on how to compile and [here](../../2-user-guide/1-installation/1-cmake-build-recipes.html) for more CMake details. Compilations is based on [Fypp](https://github.com/aradi/fypp) meta-progamming package, which is available as submodule. diff --git a/docs/guide/3-developer-guide/4-performance/1-insights.md b/docs/guide/3-developer-guide/4-performance/1-insights.md index fc30fd1615c..7847c782ad8 100644 --- a/docs/guide/3-developer-guide/4-performance/1-insights.md +++ b/docs/guide/3-developer-guide/4-performance/1-insights.md @@ -102,7 +102,7 @@ The columns describe: - `AVERAGE`: averaged over all MPI ranks - `MAXIMUM`: maximum over all MPI ranks - `AVERAGE` and `MAXIMUM` can be used to locate load-imbalance or synchronization points. -- `MAXRANKS`: +- `MAXRANKS`: #### Time spent in Just-In-Time (JIT) Compilation diff --git a/src/acc/libsmm_acc/predict/README.md b/src/acc/libsmm_acc/predict/README.md index f3cee3c785a..8f3bf3ad727 100644 --- a/src/acc/libsmm_acc/predict/README.md +++ b/src/acc/libsmm_acc/predict/README.md @@ -96,7 +96,7 @@ Explore the data interactively using the [provided Jupyter notebook](notebooks/i #### 4. Train -For each algorithm, build a predictive model using decision trees and feature selection based on the features' permutation importance. +For each algorithm, build a predictive model using decision trees and feature selection based on the features' permutation importance. ```bash ./predict_train.py # --algo medium --folder /scratch/autotuning_dataset, e.g. 
diff --git a/src/acc/libsmm_acc/predict/predict_helpers.py b/src/acc/libsmm_acc/predict/predict_helpers.py index 1ff82f7a887..a67de2081fc 100644 --- a/src/acc/libsmm_acc/predict/predict_helpers.py +++ b/src/acc/libsmm_acc/predict/predict_helpers.py @@ -29,7 +29,7 @@ def safe_pickle(data, file): :param data: data to be pickled :param file: file to pickle it into """ - max_bytes = 2 ** 31 - 1 # Maximum number of bytes to write in one chunk + max_bytes = 2**31 - 1 # Maximum number of bytes to write in one chunk pickle_out = pickle.dumps(data) n_bytes = len(pickle_out) with open(file, "wb") as f: @@ -47,7 +47,7 @@ def safe_pickle_load(file_path): :param data: data to be loaded through pickle :param file: file to read from """ - max_bytes = 2 ** 31 - 1 # Maximum number of bytes to read in one chunk + max_bytes = 2**31 - 1 # Maximum number of bytes to read in one chunk bytes_in = bytearray(0) input_size = os.path.getsize(file_path) with open(file_path, "rb") as f: diff --git a/src/acc/libsmm_acc/predict/predict_train.py b/src/acc/libsmm_acc/predict/predict_train.py index f752f4d528f..1e6a751bdda 100755 --- a/src/acc/libsmm_acc/predict/predict_train.py +++ b/src/acc/libsmm_acc/predict/predict_train.py @@ -489,7 +489,7 @@ def read_data(algo, read_from, nrows, folder, log): X = X.drop(cols_to_drop, axis=1) log += print_and_log( "X : {:>8,} x {:>8,} ({:>2.2} MB)".format( - len(X), len(X.columns), sys.getsizeof(X) / 10 ** 6 + len(X), len(X.columns), sys.getsizeof(X) / 10**6 ) ) log += print_and_log("Head:") @@ -505,7 +505,7 @@ def read_data(algo, read_from, nrows, folder, log): log += print_and_log("\nRead Y") Y = dd.read_parquet(parquet_data_file, columns=["perf_scaled"]) log += print_and_log( - "Y : {:>8,} ({:>2.2} MB)".format(len(Y), sys.getsizeof(Y) / 10 ** 6) + "Y : {:>8,} ({:>2.2} MB)".format(len(Y), sys.getsizeof(Y) / 10**6) ) log += print_and_log("Head:") log += print_and_log(Y.head()) @@ -516,7 +516,7 @@ def read_data(algo, read_from, nrows, folder, log): X_mnk = 
dd.read_parquet(parquet_data_file, columns=["mnk"]) nrows_data = len(X_mnk.index) log += print_and_log( - "X_mnk : {:>8,} ({:>2.2} MB)".format(nrows_data, sys.getsizeof(X_mnk) / 10 ** 6) + "X_mnk : {:>8,} ({:>2.2} MB)".format(nrows_data, sys.getsizeof(X_mnk) / 10**6) ) log += print_and_log("Head:") log += print_and_log(X_mnk.head()) diff --git a/src/acc/libsmm_acc/tune/README.md b/src/acc/libsmm_acc/tune/README.md index bcb8b0bf9b1..d3d0b075ed0 100644 --- a/src/acc/libsmm_acc/tune/README.md +++ b/src/acc/libsmm_acc/tune/README.md @@ -160,7 +160,7 @@ Each tune-directory contains a job file. Since there might be many tune-director When `tune_submit.py` is called without arguments, it will just list the jobs that could be submitted: ```bash -$ ./tune_submit.py +$ ./tune_submit.py tune_5x5x5: Would submit, run with "doit!" tune_5x5x8: Would submit, run with "doit!" tune_5x8x5: Would submit, run with "doit!" diff --git a/src/data/dbcsr.fypp b/src/data/dbcsr.fypp index 9fb499a28e0..5e648ac9fde 100644 --- a/src/data/dbcsr.fypp +++ b/src/data/dbcsr.fypp @@ -55,7 +55,7 @@ #:set carry = 1 #:for i in range(0,len(num)) #:set outi = 0 -#:if carry == 1 +#:if carry == 1 #:if num[i] == 0 #:set outi = 1 #:set carry = 0 @@ -63,7 +63,7 @@ #:set outi = 0 #:set carry = 1 #:endif -#:else +#:else #:set outi = num[i] #:endif #:mute @@ -86,11 +86,11 @@ $: numout.append(outi) #! generates a list of permutations from n entries #! example n = 2 -> [[0,0],[0,1],[1,0],[1,1]] where 0/1 means present/not present #:set idx = [] - #:set newidx = [] + #:set newidx = [] ${init(idx,n)}$ #:set imax = pow(2,n) - #:for i in range(0,imax) + #:for i in range(0,imax) $: permlist.append(idx) ${add_num(idx,newidx)}$ #:set idx = newidx @@ -104,8 +104,8 @@ $: numout.append(outi) #! generates permuted groups of variables from a variable list #! optional variables that appear together may be grouped #! example: varlist = [[var1], [var2,var3]] - #! 
this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []] - #:set permlist = [] + #! this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []] + #:set permlist = [] ${gen_permlist(permlist,len(varlist))}$ #:for p in permlist #:set group = [] @@ -141,7 +141,7 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla #:def print_groupif(vargroups,varlist,i,check='PRESENT',prefix='') #! for a group [[var1]] and a varlist [[var1]],[var2,var3]] #! prints "(ELSE) IF (PRESENT(var1) .AND. .NOT. PRESENT(var2) .AND. .NOT. PRESENT(var3)) THEN" -#! to be used in a loop +#! to be used in a loop #:set group = vargroups[i] #:set diff = [item for item in varlist if item not in group] #:set stat = "ELSE IF" @@ -157,9 +157,9 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla ${flatten(group,flatgroup)}$ ${flatten(diff,flatdiff)}$ #:endmute -$: stat + "(" + " .AND. ".join([check + "(" + prefix + str(i) + ")" for i in flatgroup]) & +$: stat + "(" + " .AND. ".join([check + "(" + prefix + str(i) + ")" for i in flatgroup]) & + " .AND. " * (bool(len(diff)) * bool(len(diff) - len(varlist))) & - + " .AND. ".join([".NOT. " + check + "(" + prefix + str(i) + ")" for i in flatdiff]) + ") THEN " + + " .AND. ".join([".NOT. 
" + check + "(" + prefix + str(i) + ")" for i in flatdiff]) + ") THEN " #:else ELSE #:endif diff --git a/src/dbcsr_api_c.F b/src/dbcsr_api_c.F index 1304cc03b9f..eeb8939c16b 100644 --- a/src/dbcsr_api_c.F +++ b/src/dbcsr_api_c.F @@ -1406,7 +1406,7 @@ SUBROUTINE c_dbcsr_get_${var}$ (c_matrix, c_${var}$, c_size) BIND(C, name="c_dbc END DO #:else DO i = 1, c_size - c_${var}$ (i) = ${var}$ (i); + c_${var}$ (i) = ${var}$ (i) END DO #:endif NULLIFY (${var}$) diff --git a/src/ops/PACKAGE b/src/ops/PACKAGE index 4684cb7adb0..c490ac0e14b 100644 --- a/src/ops/PACKAGE +++ b/src/ops/PACKAGE @@ -1,6 +1,6 @@ { "description": "High level DBCSR operations", "archive": "libdbcsr", -"requires": ["../acc", "../mpi", "../data", "../base", "../dist", +"requires": ["../acc", "../mpi", "../data", "../base", "../dist", "../block", "../utils", "../core", "../mm", "../work"], } diff --git a/src/tensors/dbcsr_tensor.fypp b/src/tensors/dbcsr_tensor.fypp index d66e4b9691b..061022abecd 100644 --- a/src/tensors/dbcsr_tensor.fypp +++ b/src/tensors/dbcsr_tensor.fypp @@ -127,7 +127,7 @@ $: ", ".join(["int** " + name + "_" + str(i) + ", " + "int* " + name + "_" + #:set carry = 1 #:for i in range(0,len(num)) #:set outi = 0 -#:if carry == 1 +#:if carry == 1 #:if num[i] == 0 #:set outi = 1 #:set carry = 0 @@ -135,7 +135,7 @@ $: ", ".join(["int** " + name + "_" + str(i) + ", " + "int* " + name + "_" + #:set outi = 0 #:set carry = 1 #:endif -#:else +#:else #:set outi = num[i] #:endif #:mute @@ -158,11 +158,11 @@ $: numout.append(outi) #! generates a list of permutations from n entries #! example n = 2 -> [[0,0],[0,1],[1,0],[1,1]] where 0/1 means present/not present #:set idx = [] - #:set newidx = [] + #:set newidx = [] ${init(idx,n)}$ #:set imax = pow(2,n) - #:for i in range(0,imax) + #:for i in range(0,imax) $: permlist.append(idx) ${add_num(idx,newidx)}$ #:set idx = newidx @@ -176,8 +176,8 @@ $: numout.append(outi) #! generates permuted groups of variables from a variable list #! 
optional variables that appear together may be grouped #! example: varlist = [[var1], [var2,var3]] - #! this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []] - #:set permlist = [] + #! this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []] + #:set permlist = [] ${gen_permlist(permlist,len(varlist))}$ #:for p in permlist #:set group = [] @@ -213,7 +213,7 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla #:def print_groupif(vargroups,varlist,i,check='PRESENT',prefix='') #! for a group [[var1]] and a varlist [[var1]],[var2,var3]] #! prints "(ELSE) IF (PRESENT(var1) .AND. .NOT. PRESENT(var2) .AND. .NOT. PRESENT(var3)) THEN" -#! to be used in a loop +#! to be used in a loop #:set group = vargroups[i] #:set diff = [item for item in varlist if item not in group] #:set stat = "ELSE IF" @@ -229,9 +229,9 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla ${flatten(group,flatgroup)}$ ${flatten(diff,flatdiff)}$ #:endmute -$: stat + "(" + " .AND. ".join([check + "(" + prefix + str(i) + ")" for i in flatgroup]) & +$: stat + "(" + " .AND. ".join([check + "(" + prefix + str(i) + ")" for i in flatgroup]) & + " .AND. " * (bool(len(diff)) * bool(len(diff) - len(varlist))) & - + " .AND. ".join([".NOT. " + check + "(" + prefix + str(i) + ")" for i in flatdiff]) + ") THEN " + + " .AND. ".join([".NOT. " + check + "(" + prefix + str(i) + ")" for i in flatdiff]) + ") THEN " #:else ELSE #:endif diff --git a/tools/build_libsmm/README b/tools/build_libsmm/README index aedac067eb5..12544806fe9 100644 --- a/tools/build_libsmm/README +++ b/tools/build_libsmm/README @@ -1,11 +1,11 @@ libsmm: a library for small matrix multiplies. 
In order to deal efficiently with small matrix multiplies, -often involving 'special' matrix dimensions such as 5,13,17,22, +often involving 'special' matrix dimensions such as 5,13,17,22, a dedicated matrix library can be generated that outperforms (or matches) general purpose (optimized) blas libraries. -Generation requires extensive compilation and timing runs, and is machine specific, +Generation requires extensive compilation and timing runs, and is machine specific, i.e. the library should be constructed on the architecture it is supposed to run. Users can modify the values inside the file config.in to set which kind of library @@ -18,7 +18,7 @@ Below you can find the detailed instructions for some examples. ==================================================================================================================== a) How to generate the library running several jobs in a cluster, where each - node allows for both execution and compilation. + node allows for both execution and compilation. For this example we will use a CRAY system with GNU compiler and SLURM. Run "./generate -h" to see the meaning of the options. @@ -34,11 +34,11 @@ a) How to generate the library running several jobs in a cluster, where each Then run: ./generate -c config/cray.gnu small2 This command collects all results produced in the small1 phase and it generates a file small_gen_optimal_dnn_cray.gnu.out - + 4) Run: ./generate -c config/cray.gnu -t 16 -w slurm lib This commman submit in batch a single job that compiles the library. At the end the library is produced inside the directory lib/ - (libsmm_dnn_cray.gnu.a). + (libsmm_dnn_cray.gnu.a). 5) It is highly recommended to run the final test to check the correctness of the library. 
Run: ./generate -c config/cray.gnu -j 20 -w slurm check1 @@ -51,15 +51,15 @@ a) How to generate the library running several jobs in a cluster, where each ==================================================================================================================== -b) How to generate the library running a single job interactively. +b) How to generate the library running a single job interactively. For this example we will use a Linux system with GNU compiler. Run "./generate -h" to see the meaning of the options. 1) Run: ./generate -c config/linux.gnu -j 10 -t 16 -w none tiny1 - This command generates, compiles and executes the tiny kernels + This command generates, compiles and executes the tiny kernels in 10 groups. Please increase the number of groups (-j <#> option) if you get the error "Argument list too long". - + 2) Run: ./generate -c config/linux.gnu tiny2 This command collects all results produced in the tiny1 phase and it generates a file tiny_gen_optimal_dnn_linux.gnu.out @@ -88,7 +88,7 @@ c) How to generate the library for the Intel Xeon Phi in batch mode. Run "./generate -h" to see the meaning of the options. We use the config file mic.intel (inside the directory config). Check if all options are OK for your case, in particular: - - the target_compile variable with the flag "-offload-attribute-target=mic". + - the target_compile variable with the flag "-offload-attribute-target=mic". - the target_compile_offload variable with the flag "-offload=mandatory". - Set the MIC_OMP_NUM_THREADS variable to the number of cores on the card. @@ -96,11 +96,11 @@ c) How to generate the library for the Intel Xeon Phi in batch mode. Phi. Performance output files are written in the same directory where the library is executed on the host, therefore this directory must be exported to the Xeon Phi with the right permission (read/write). - + 1) Run: ./generate -c config/mic.intel -j 100 -t 16 -w slurm tiny1 This command submits 100 jobs in batch. 
Each job offloads executions to the Intel Xeon Phi card (MIC_OMP_NUM_THREADS threads). Wait until - completion of all jobs. + completion of all jobs. 2) Run: ./generate -c config/mic.intel tiny2 This command collects all results of the tiny1 phase and it generates @@ -116,7 +116,7 @@ c) How to generate the library for the Intel Xeon Phi in batch mode. 4) Run: ./generate -c config/mic.intel -t 16 -w slurm lib This commman submit in batch a single job that compiles the library. At the end the library is produced inside the directory lib/ - (libsmm_dnn_mic.intel.a). + (libsmm_dnn_mic.intel.a). 5) It is highly recommended to run the final test to check the correctness of the library. Run: ./generate -c config/mic.intel -j 200 -w slurm check1 diff --git a/tools/build_libsmm/config.in b/tools/build_libsmm/config.in index dd6eb64844c..e2412d9985c 100644 --- a/tools/build_libsmm/config.in +++ b/tools/build_libsmm/config.in @@ -13,14 +13,14 @@ transpose_flavor=1 # 1) d => double precision real # 2) s => single precision real # 3) z => double precision complex -# 4) c => single precision complex +# 4) c => single precision complex # # select a data_type from the list 1 2 3 4 # data_type=1 # -# matrix dimensions for which optimized routines will be generated. +# matrix dimensions for which optimized routines will be generated. # since all combinations of M,N,K are being generated the size of the library becomes very large # if too many sizes are being optimized for. Numbers have to be ascending. # diff --git a/tools/build_libsmm/generate b/tools/build_libsmm/generate index 65e09747be6..91eb0a46911 100755 --- a/tools/build_libsmm/generate +++ b/tools/build_libsmm/generate @@ -49,7 +49,7 @@ show_help() { echo " all: generate, compile and run the kernels." echo " source: only generate the source file kernels." echo " compile: generate and compile the kernels." - echo " Default value is \"${def_target}\"." + echo " Default value is \"${def_target}\"." 
echo echo "COMMAND is one of the followings:" echo " tiny1 : it runs the tiny phase. Batch execution if requested." @@ -96,7 +96,7 @@ while getopts "c:hj:s:t:w:m:a:" OPTION; do case $OPTION in c) config_file=$OPTARG - ;; + ;; h) show_help ;; @@ -106,7 +106,7 @@ while getopts "c:hj:s:t:w:m:a:" OPTION; do ;; s) SIMD=$OPTARG - ;; + ;; t) ntasks=$OPTARG check_number $ntasks $OPTION $ntasks @@ -125,10 +125,10 @@ while getopts "c:hj:s:t:w:m:a:" OPTION; do *) echo "Warning: target \"$OPTARG\" unknown. Run ./generate -h for help." exit - ;; + ;; esac ;; - ?) + ?) exit ;; esac @@ -366,7 +366,7 @@ case "${transpose_flavor}" in tb="N" decl="A(K,M), B(K,N)" lds="LDA=K ; LDB=K" - ;; + ;; 3 ) type_label+="nt" ta="N" @@ -400,7 +400,7 @@ case $cmd in ;; check1|check2) run_dir+="_check" - + ;; esac run_dir+="${type_label}" diff --git a/tools/build_libsmm/generate.bash b/tools/build_libsmm/generate.bash index c8c7a7206bb..e3b2a1d1e6a 100644 --- a/tools/build_libsmm/generate.bash +++ b/tools/build_libsmm/generate.bash @@ -1,6 +1,6 @@ # # Author: Alfio Lazzaro, alfio.lazzaro@mat.ethz.ch (2013-2015) -# Library for the generate script used in LIBSMM library +# Library for the generate script used in LIBSMM library # # @@ -24,7 +24,7 @@ write_makefile_header() { printf "LIBSMM_INDICES = \$(wordlist \$(LIBSMM_SI),\$(LIBSMM_EI),\$(LIBSMM_DIMS_INDICES))\n\n" # - # output directory for compiled and results files + # output directory for compiled and results files # printf "LIBSMM_WORKDIR=${work_dir}\n\n" @@ -32,12 +32,12 @@ write_makefile_header() { # list of source files # printf "LIBSMM_SRCFILES=\$(patsubst %%,${prefix_file}_find_%%.f90,\$(LIBSMM_INDICES)) \n" - + # # list of executables # printf "LIBSMM_OBJFILES=\$(patsubst %%,\$(LIBSMM_WORKDIR)/${prefix_file}_find_%%.o,\$(LIBSMM_INDICES)) \n" - + # # list of output files # @@ -176,16 +176,16 @@ collect_results() { do_generate_tiny() { - # + # # skip the compilation part if it needs only to collect the results # if [ "$run_cmd" != "true" 
]; then # # compile the generator of tiny mults # - ${host_compile} -c mults.f90 + ${host_compile} -c mults.f90 ${host_compile} mults.o tiny_gen.f90 -o tiny_gen.x - + # # for easy parallelism go via a Makefile # @@ -290,7 +290,7 @@ do_generate_small() { printf " \$@" printf " libxsmm_" printf '`echo $* | awk -F_ '\''{ print $$6"_"$$7"_"$$8" "$$6" "$$7" "$$8" "$$6" "$$8" "$$6 }'\''` ' - printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}" + printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}" fi printf "\n\n" ) > ${make_file} @@ -312,8 +312,8 @@ do_generate_lib() { echo "Abort execution." echo exit - fi - + fi + # # Check if small file exists # @@ -322,13 +322,13 @@ do_generate_lib() { echo "Abort execution." echo exit - fi + fi # # compile the generator of small mults # - ${host_compile} -c mults.f90 - ${host_compile} -c multrec_gen.f90 + ${host_compile} -c mults.f90 + ${host_compile} -c multrec_gen.f90 ${host_compile} mults.o multrec_gen.o lib_gen.f90 -o lib_gen.x # @@ -357,7 +357,7 @@ do_generate_lib() { eles="(/0" for i in `seq 1 $maxsize` do - + found=0 for myn in ${dims_small} do @@ -369,12 +369,12 @@ do_generate_lib() { count=$((count+1)) ele=$count else - ele=0 + ele=0 fi eles="$eles,$ele" done eles="$eles/)" - + cd ${run_dir} file="smm${type_label}.f90" @@ -401,7 +401,7 @@ do_generate_lib() { printf " IF (N<=$maxsize) THEN\n in=indx(N)\n ELSE\n in=0\n ENDIF\n" >> ${file} printf " IF (K<=$maxsize) THEN\n ik=indx(K)\n ELSE\n ik=0\n ENDIF\n" >> ${file} printf " itot=(ik*($numsize+1)+in)*($numsize+1)+im\n" >> ${file} - + count=0 printf " SELECT CASE(itot)\n" >> ${file} for myk in 0 ${dims_small} @@ -451,7 +451,7 @@ do_generate_lib() { printf "END SUBROUTINE smm${type_label}\n\n" >> ${file} fi } - + write_routine write_routine 0 @@ -498,7 +498,7 @@ do_generate_lib() { printf "LIBSMM_OBJFILES=\$(patsubst %%,\$(LIBSMM_WORKDIR)/smm${type_label}_%%.o,\$(LIBSMM_DIMS_INDICES)) \n\n" printf ".PHONY: \$(LIBSMM_WORKDIR)/\$(LIBSMM_DRIVER) all_libsmm \n\n" - + printf 
"all_libsmm: \n\n" # @@ -530,7 +530,7 @@ do_generate_lib() { printf " \$@" printf " libxsmm_" printf '`echo $* | awk -F_ '\''{ print $$4"_"$$5"_"$$6" "$$4" "$$5" "$$6" "$$4" "$$6" "$$4 }'\''` ' - printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}\n" + printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}\n" fi printf "\n" @@ -592,7 +592,7 @@ do_check() { exit fi - # + # # skip the compilation part if it needs only to collect the results # if [ "$run_cmd" != "true" ]; then @@ -621,11 +621,11 @@ do_check() { if [ $element_end -eq 0 -o $element -gt $element_end ]; then element_end=$(( element_end + nelements_in )) ijob=$(( ijob + 1)) - + if [ ${ijob} -le ${nelements_out} ]; then element_end=$(( element_end + 1)) fi - + echo "Preparing test program for job #$ijob..." filename=${test_file}_job$ijob @@ -715,7 +715,7 @@ SUBROUTINE testit(M,N,K) CALL MYRAND(C1) C2=C1 - CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M) + CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M) CALL smm${type_label}(M,N,K,A,B,C2) IF (MAXVAL(ABS(C2-C1))>100*EPSILON(REAL(1.0,KIND=KIND(A(1,1))))) THEN @@ -732,11 +732,11 @@ SUBROUTINE testit(M,N,K) A=0; B=0; C1=0 ; C2=0 - CALL CPU_TIME(t1) + CALL CPU_TIME(t1) DO i=1,Niter - CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M) + CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M) ENDDO - CALL CPU_TIME(t2) + CALL CPU_TIME(t2) CALL CPU_TIME(t3) DO i=1,Niter @@ -748,7 +748,7 @@ SUBROUTINE testit(M,N,K) " smm: ",Niter*flops/(t4-t3)/gflop," Gflops. Linked blas: ",Niter*flops/(t2-t1)/gflop,& " Gflops. 
Performance ratio: ",((t2-t1)/(t4-t3))*100,"%" -END SUBROUTINE +END SUBROUTINE EOF if [ -n "${target_compile_offload}" ]; then printf '!dir$ attributes offload:mic :: testit \n ' >> ${work_dir}/${filename}.f90 @@ -796,7 +796,7 @@ EOF rm -f ${work_dir}/${filename}.sh # - # Prepare the script for compile the benchmarking + # Prepare the script for compile the benchmarking # and testing program for the smm library # ( @@ -816,7 +816,7 @@ EOF ${run_cmd} ./${work_dir}/${filename}.sh fi - element=$(( element + 1 )) + element=$(( element + 1 )) done ; done ; done @@ -859,5 +859,5 @@ EOF echo "Final library can be found at ${archive}" echo fi - + } diff --git a/tools/build_libsmm/make.gen b/tools/build_libsmm/make.gen index fe06830b477..c297a85f821 100644 --- a/tools/build_libsmm/make.gen +++ b/tools/build_libsmm/make.gen @@ -97,7 +97,7 @@ $(LIBSMM_EXE:.x=.f90): done; \ printf "\n"; \ fi >> $@ - @printf "PROGRAM $(notdir $(basename $@)) \n" >> $@ + @printf "PROGRAM $(notdir $(basename $@)) \n" >> $@ @printf " USE omp_lib \n" >> $@ @if [ -n "$(LIBSMM_MIC_OFFLOAD)" ]; then printf " USE mic_lib \n" >> $@ ; fi @printf " IMPLICIT NONE \n" >> $@