Skip to content

Commit

Permalink
linting
Browse files (browse the repository at this point in the history)
  • Loading branch information
tahaelbayad committed Jan 7, 2025
1 parent d1c2b8e commit bde9ad6
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 145 deletions.
1 change: 1 addition & 0 deletions Deeploy/Targets/Generic/TypeCheckers.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def _inferSignedness(self, inputs: List[VariableBuffer],
else:
return [False]


class RQAddChecker(SignPropTypeChecker):

def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
Expand Down
10 changes: 5 additions & 5 deletions Deeploy/Targets/PULPOpen/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,17 @@
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate
from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.Templates import ConcatTemplate, RQSiGELUTemplate, iHardswishTemplate, RQAddTemplate
from Deeploy.Targets.Generic.Templates import ConcatTemplate, RQAddTemplate, RQSiGELUTemplate, iHardswishTemplate
from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, GELUChecker, HardswishChecker, MatMulChecker, \
MulChecker, ReduceMeanChecker, RQHardswishChecker, SliceChecker, SoftmaxChecker, TransposeChecker, \
iLayerNormChecker, RQAddChecker
MulChecker, ReduceMeanChecker, RQAddChecker, RQHardswishChecker, SliceChecker, SoftmaxChecker, TransposeChecker, \
iLayerNormChecker
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, \
MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, RQSiHardswishTemplate, SliceTemplate, \
TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate
MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, RQSiHardswishTemplate, SliceTemplate, TallGEMMTemplate, \
TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate
from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \
PULPRequantShiftChecker
from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement
Expand Down
6 changes: 3 additions & 3 deletions Deeploy/Targets/PULPOpen/Platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@
PadLayer, ReduceMeanLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, \
SliceLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer, iSoftmaxLayer
from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, FlattenParser, GatherParser, MatMulParser, \
MulParser, Pad1DParser, Pad2DParser, ReduceMeanParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, \
RQSiGELUParser, RQSiHardswishParser, SliceParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, \
iHardswishParser, iRMSNormParser, iSoftmaxParser, RQAddParser
MulParser, Pad1DParser, Pad2DParser, ReduceMeanParser, RequantShiftParser, ReshapeParser, RQAddParser, \
RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SliceParser, TransposeParser, UniformRequantShiftParser, \
UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser
from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import IntegerDivRequantMergePass, \
MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, SkipEmptyConcatPass, \
Expand Down
4 changes: 2 additions & 2 deletions Deeploy/Targets/Snitch/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.Generic.Templates import iNoNormTemplate, RQAddTemplate
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, SoftmaxChecker, iNoNormChecker, RQAddChecker
from Deeploy.Targets.Generic.Templates import RQAddTemplate, iNoNormTemplate
from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker
from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \
SnitchProfileExecutionBlockPass, SnitchSynchCoresPass
from Deeploy.Targets.Snitch.Templates import AddTemplate, iSoftmaxTemplate
Expand Down
273 changes: 138 additions & 135 deletions TargetLibraries/Snitch/src/pulp_nn_add_i8_i8_i8.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,183 +20,186 @@

#include "DeeploySnitchMath.h"

void __attribute__((noinline))
pulp_nn_add_i8_i8_i8(int8_t *pIn1, int8_t *pIn2, int8_t *pOut, int32_t in1_mul,
int32_t in1_add, uint16_t in1_shift, int32_t in2_mul,
int32_t in2_add, uint16_t in2_shift, int32_t out_mul,
int32_t out_add, uint16_t out_shift, uint16_t dim_im_in_x,
uint16_t dim_im_in_y, uint16_t ch_im_in,
int out_requant_flag) {
int core_id = snrt_global_compute_core_idx();
int n_cores = snrt_global_compute_core_num();

if (dim_im_in_y < n_cores) {
n_cores = dim_im_in_y;
}

int Log2Core = INT_LOG2(n_cores);
int chunck = (dim_im_in_y >> Log2Core) + ((dim_im_in_y & (n_cores - 1)) != 0);

int32_t in1_rq1, in1_rq2, in1_rq3, in1_rq4, in2_rq1, in2_rq2, in2_rq3,
in2_rq4;
int32_t sum1, sum2, sum3, sum4;
int32_t sum_out1, sum_out2, sum_out3, sum_out4;
int32_t out1, out2, out3, out4, sum_int1, sum_int2, sum_int3, sum_int4;

int ch_im_in1_r = ch_im_in >> 0;
int ch_im_in2_r = ch_im_in >> 0;
int ch_im_out_r = ch_im_in >> 0;

void __attribute__ ((noinline)) pulp_nn_add_i8_i8_i8(
int8_t * pIn1,
int8_t * pIn2,
int8_t * pOut,
int32_t in1_mul,
int32_t in1_add,
uint16_t in1_shift,
int32_t in2_mul,
int32_t in2_add,
uint16_t in2_shift,
int32_t out_mul,
int32_t out_add,
uint16_t out_shift,
uint16_t dim_im_in_x,
uint16_t dim_im_in_y,
uint16_t ch_im_in,
int out_requant_flag)
{
int core_id = snrt_global_compute_core_idx();
int n_cores = snrt_global_compute_core_num();

if (dim_im_in_y < n_cores){
n_cores = dim_im_in_y;
}

int Log2Core = INT_LOG2(n_cores);
int chunck = (dim_im_in_y >> Log2Core) + ((dim_im_in_y & (n_cores - 1)) != 0);

int32_t in1_rq1, in1_rq2, in1_rq3, in1_rq4,
in2_rq1, in2_rq2, in2_rq3, in2_rq4;
int32_t sum1, sum2, sum3, sum4;
int32_t sum_out1, sum_out2, sum_out3, sum_out4;
int32_t out1, out2, out3, out4,
sum_int1, sum_int2, sum_int3, sum_int4;


int start = MIN(chunck * core_id, dim_im_in_y);
int stop = MIN(start + chunck, dim_im_in_y);

int ch_im_in1_r = ch_im_in >> 0;
int ch_im_in2_r = ch_im_in >> 0;
int ch_im_out_r = ch_im_in >> 0;
int8_t *target1 = pIn1 + start * ch_im_in1_r * dim_im_in_x;
int8_t *target2 = pIn2 + start * ch_im_in2_r * dim_im_in_x;
int8_t *pOutBuffer = pOut + start * ch_im_out_r * dim_im_in_x;

int start = MIN(chunck * core_id, dim_im_in_y);
int stop = MIN(start + chunck, dim_im_in_y);
int a = 0;
int b = 0;

int8_t *target1 = pIn1 + start * ch_im_in1_r * dim_im_in_x;
int8_t *target2 = pIn2 + start * ch_im_in2_r * dim_im_in_x;
int8_t *pOutBuffer = pOut + start * ch_im_out_r * dim_im_in_x;
int8_t *target1_ext = &a;
int8_t *target2_ext = &b;

int a = 0;
int b = 0;
for (int i = 0; i < (((stop - start) * ch_im_out_r * dim_im_in_x) >> 2);
i++) {
target1_ext = target1;
target1 += 4;

int8_t *target1_ext = &a;
int8_t *target2_ext = &b;

for (int i=0; i<(((stop-start) * ch_im_out_r * dim_im_in_x) >> 2); i++)
{
target1_ext = target1;
target1+=4;

target2_ext = target2;
target2+=4;
target2_ext = target2;
target2 += 4;
#ifdef ADD_VERBOSE
printf("core %d - in1 it0 before requant: %d\n", core_id, *(target1_ext));
printf("core %d - in2 it0 before requant: %d\n", core_id, *(target2_ext));
printf("core %d - in1 it0 before requant: %d\n", core_id, *(target1_ext));
printf("core %d - in2 it0 before requant: %d\n", core_id, *(target2_ext));
#endif
in1_rq1 = ((*(target1_ext)) * in1_mul + in1_add) >> in1_shift;
in2_rq1 = ((*(target2_ext)) * in2_mul + in2_add) >> in2_shift;
sum1 = clips8(in1_rq1) + clips8(in2_rq1);
in1_rq1 = ((*(target1_ext)) * in1_mul + in1_add) >> in1_shift;
in2_rq1 = ((*(target2_ext)) * in2_mul + in2_add) >> in2_shift;
sum1 = clips8(in1_rq1) + clips8(in2_rq1);
#ifdef ADD_VERBOSE
printf("core %d - in1_rq1 it0 after requant: %d\nclipped in1_rq1: %d\n", core_id, in1_rq1, clips8(in1_rq1));
printf("core %d - in2_rq1 it0 after requant: %d\nclipped in2_rq1: %d\n", core_id, in2_rq1), clips8(in2_rq1);
printf("core %d - sum1: %d\n", core_id, sum1);
printf("core %d - in1_rq1 it0 after requant: %d\nclipped in1_rq1: %d\n",
core_id, in1_rq1, clips8(in1_rq1));
printf("core %d - in2_rq1 it0 after requant: %d\nclipped in2_rq1: %d\n",
core_id, in2_rq1),
clips8(in2_rq1);
printf("core %d - sum1: %d\n", core_id, sum1);
#endif
#ifdef ADD_VERBOSE
printf("core %d - in1 it1 before requant: %d\n", core_id, *(target1_ext + 1 ));
printf("core %d - in2 it1 before requant: %d\n", core_id, *(target2_ext + 1 ));
printf("core %d - in1 it1 before requant: %d\n", core_id,
*(target1_ext + 1));
printf("core %d - in2 it1 before requant: %d\n", core_id,
*(target2_ext + 1));
#endif
in1_rq2 = ((*(target1_ext + 1 )) * in1_mul + in1_add) >> in1_shift;
in2_rq2 = ((*(target2_ext + 1 )) * in2_mul + in2_add) >> in2_shift;
sum2 = clips8(in1_rq2) + clips8(in2_rq2);
in1_rq2 = ((*(target1_ext + 1)) * in1_mul + in1_add) >> in1_shift;
in2_rq2 = ((*(target2_ext + 1)) * in2_mul + in2_add) >> in2_shift;
sum2 = clips8(in1_rq2) + clips8(in2_rq2);
#ifdef ADD_VERBOSE
printf("core %d - in1_rq2 it1 after requant: %d\nclipped in1_rq2: %d\n", core_id, in1_rq2, clips8(in1_rq2));
printf("core %d - in2_rq2 it1 after requant: %d\nclipped in2_rq2: %d\n", core_id, in2_rq2), clips8(in2_rq2);
printf("core %d - sum2: %d\n", core_id, sum2);
printf("core %d - in1_rq2 it1 after requant: %d\nclipped in1_rq2: %d\n",
core_id, in1_rq2, clips8(in1_rq2));
printf("core %d - in2_rq2 it1 after requant: %d\nclipped in2_rq2: %d\n",
core_id, in2_rq2),
clips8(in2_rq2);
printf("core %d - sum2: %d\n", core_id, sum2);
#endif
#ifdef ADD_VERBOSE
printf("core %d - in1 it2 before requant: %d\n", core_id, *(target1_ext + 2 ));
printf("core %d - in2 it2 before requant: %d\n", core_id, *(target2_ext + 2 ));
printf("core %d - in1 it2 before requant: %d\n", core_id,
*(target1_ext + 2));
printf("core %d - in2 it2 before requant: %d\n", core_id,
*(target2_ext + 2));
#endif
in1_rq3 = ((*(target1_ext + 2 )) * in1_mul + in1_add) >> in1_shift;
in2_rq3 = ((*(target2_ext + 2 )) * in2_mul + in2_add) >> in2_shift;
sum3 = clips8(in1_rq3) + clips8(in2_rq3);
in1_rq3 = ((*(target1_ext + 2)) * in1_mul + in1_add) >> in1_shift;
in2_rq3 = ((*(target2_ext + 2)) * in2_mul + in2_add) >> in2_shift;
sum3 = clips8(in1_rq3) + clips8(in2_rq3);
#ifdef ADD_VERBOSE
printf("core %d - in1_rq3 it2 after requant: %d\nclipped in1_rq3: %d\n", core_id, in1_rq3, clips8(in1_rq3));
printf("core %d - in2_rq3 it2 after requant: %d\nclipped in2_rq3: %d\n", core_id, in2_rq3), clips8(in2_rq3);
printf("core %d - sum3: %d\n", core_id, sum3);
printf("core %d - in1_rq3 it2 after requant: %d\nclipped in1_rq3: %d\n",
core_id, in1_rq3, clips8(in1_rq3));
printf("core %d - in2_rq3 it2 after requant: %d\nclipped in2_rq3: %d\n",
core_id, in2_rq3),
clips8(in2_rq3);
printf("core %d - sum3: %d\n", core_id, sum3);
#endif
#ifdef ADD_VERBOSE
printf("core %d - in1 it3 before requant: %d\n", core_id, *(target1_ext + 3 ));
printf("core %d - in2 it3 before requant: %d\n", core_id, *(target2_ext + 3 ));
printf("core %d - in1 it3 before requant: %d\n", core_id,
*(target1_ext + 3));
printf("core %d - in2 it3 before requant: %d\n", core_id,
*(target2_ext + 3));
#endif
in1_rq4 = ((*(target1_ext + 3 )) * in1_mul + in1_add) >> in1_shift;
in2_rq4 = ((*(target2_ext + 3 )) * in2_mul + in2_add) >> in2_shift;
sum4 = clips8(in1_rq4) + clips8(in2_rq4);
in1_rq4 = ((*(target1_ext + 3)) * in1_mul + in1_add) >> in1_shift;
in2_rq4 = ((*(target2_ext + 3)) * in2_mul + in2_add) >> in2_shift;
sum4 = clips8(in1_rq4) + clips8(in2_rq4);
#ifdef ADD_VERBOSE
printf("core %d - in1_rq4 it3 after requant: %d\nclipped in1_rq4: %d\n", core_id, in1_rq4, clips8(in1_rq4));
printf("core %d - in2_rq4 it3 after requant: %d\nclipped in2_rq4: %d\n", core_id, in2_rq4), clips8(in2_rq4);
printf("core %d - sum4: %d\n", core_id, sum4);
printf("core %d - in1_rq4 it3 after requant: %d\nclipped in1_rq4: %d\n",
core_id, in1_rq4, clips8(in1_rq4));
printf("core %d - in2_rq4 it3 after requant: %d\nclipped in2_rq4: %d\n",
core_id, in2_rq4),
clips8(in2_rq4);
printf("core %d - sum4: %d\n", core_id, sum4);
#endif

if (out_requant_flag) {
sum1 = (sum1 * out_mul + out_add) >> out_shift;
if (out_requant_flag) {
sum1 = (sum1 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
printf("core %d - requantized sum1: %d\n", core_id, sum1);
printf("core %d - requantized sum1: %d\n", core_id, sum1);
#endif
sum2 = (sum2 * out_mul + out_add) >> out_shift;
sum2 = (sum2 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
printf("core %d - requantized sum2: %d\n", core_id, sum2);
printf("core %d - requantized sum2: %d\n", core_id, sum2);
#endif
sum3 = (sum3 * out_mul + out_add) >> out_shift;
sum3 = (sum3 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
printf("core %d - requantized sum3: %d\n", core_id, sum3);
printf("core %d - requantized sum3: %d\n", core_id, sum3);
#endif
sum4 = (sum4 * out_mul + out_add) >> out_shift;
sum4 = (sum4 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
printf("core %d - requantized sum4: %d\n", core_id, sum4);
printf("core %d - requantized sum4: %d\n", core_id, sum4);
#endif
}
out1 = clips8(sum1);
}
out1 = clips8(sum1);
#ifdef ADD_VERBOSE
printf("core %d - out1 clipped: %d\n", core_id, out1);
printf("core %d - out1 clipped: %d\n", core_id, out1);
#endif
out2 = clips8(sum2);
out2 = clips8(sum2);
#ifdef ADD_VERBOSE
printf("core %d - out2 clipped: %d\n", core_id, out2);
printf("core %d - out2 clipped: %d\n", core_id, out2);
#endif
out3 = clips8(sum3);
out3 = clips8(sum3);
#ifdef ADD_VERBOSE
printf("core %d - out3 clipped: %d\n", core_id, out3);
printf("core %d - out3 clipped: %d\n", core_id, out3);
#endif
out4 = clips8(sum4);
out4 = clips8(sum4);
#ifdef ADD_VERBOSE
printf("core %d - out4 clipped: %d\n", core_id, out4);
printf("core %d - out4 clipped: %d\n", core_id, out4);
#endif

*pOutBuffer = (int8_t)out1;
pOutBuffer++;
*pOutBuffer = (int8_t)out2;
pOutBuffer++;
*pOutBuffer = (int8_t)out3;
pOutBuffer++;
*pOutBuffer = (int8_t)out4;
pOutBuffer++;
}
// SCHEREMO: Cleanup leftovers, not doing it with this codebase for sub-byte
// formats
for (int i = 0; i < (((stop - start) * ch_im_out_r * dim_im_in_x) % 4); i++) {
in1_rq1 = ((*(target1)) * in1_mul + in1_add) >> in1_shift;
in2_rq1 = ((*(target2)) * in2_mul + in2_add) >> in2_shift;

*pOutBuffer = (int8_t) out1;
pOutBuffer++;
*pOutBuffer = (int8_t) out2;
pOutBuffer++;
*pOutBuffer = (int8_t) out3;
pOutBuffer++;
*pOutBuffer = (int8_t) out4;
pOutBuffer++;
// SCHEREMO: Maybe it's just LLVM, but unless I hack 3 non-unrolled nops in
// here, stuff fails
#pragma nounroll
for (int j = 0; j < 3; j++) {
asm volatile("nop" ::);
}
// SCHEREMO: Cleanup leftovers, not doing it with this codebase for sub-byte formats
for (int i=0; i<(((stop-start) * ch_im_out_r * dim_im_in_x) % 4); i++){
in1_rq1 = ((*(target1)) * in1_mul + in1_add) >> in1_shift;
in2_rq1 = ((*(target2)) * in2_mul + in2_add) >> in2_shift;

// SCHEREMO: Maybe it's just LLVM, but unless I hack 3 non-unrolled nops in here, stuff fails
#pragma nounroll
for (int j = 0; j < 3; j++) {
asm volatile("nop" ::);
}

target1++;
target2++;
sum1 = clips8(in1_rq1) + clips8(in2_rq1);
if (out_requant_flag) {
sum1 = (sum1 * out_mul + out_add) >> out_shift;
}

out1 = clips8(sum1);
*pOutBuffer = (int8_t)out1;
pOutBuffer++;

target1++;
target2++;
sum1 = clips8(in1_rq1) + clips8(in2_rq1);
if (out_requant_flag) {
sum1 = (sum1 * out_mul + out_add) >> out_shift;
}

out1 = clips8(sum1);
*pOutBuffer = (int8_t)out1;
pOutBuffer++;
}
}

0 comments on commit bde9ad6

Please sign in to comment.