Add Float Support & Float GEMM for Generic (#25)

1. Float Support for Constbuffer 2. Float GEMM on Generic Target 3. Added FP GEMM to CI 4. Fixed Float bug on Testslice, CMSIS TestUtil, DivInteger 5. Fixed AbstractDataType Float Bugs Co-authored-by: Victor Jung <[email protected]>
pulp-platform · Jan 16, 2025 · bb7e56d · bb7e56d
1 parent 3c64bda
commit bb7e56d
Show file tree

Hide file tree

Showing 15 changed files with 142 additions and 36 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -49,6 +49,7 @@ jobs:
         test2DRequantizedConv
         iSoftmax
         FloatAdder
+        testFloatGEMM
 
   generic-models:
     uses: ./.github/workflows/TestRunnerGeneric.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@
 ### Fixed
 - Fix broken softmax kernel for generic platform ([#2](https://github.com/pulp-platform/Deeploy/pull/2)).
 
+
 ## Minor CI and Readme Improvements
 
 ### Added
@@ -16,6 +17,7 @@
 - Update the link of the Docker container used to run the CI with the Docker published by this repo instead of my fork.
 - Add a retry on timeout step for large network tests. This is a temporary fix to address the sporadic freeze happening at the compilation stage, see [this issue](https://github.com/pulp-platform/Deeploy/issues/9).
 
+
 ## Floating Point Support
 
 ### Added
@@ -26,6 +28,7 @@
 - Extend `testType.py` with float tests
 - LIMITATION: Current LLVM compiler does not support bfp16 and fp16, these types are commented in the library header
 
+
 ## Snitch Cluster Support
 
 ### Added
@@ -38,6 +41,7 @@
 ### Changed
 - Update the Banshee's commit to include a recent PR.
 
+
 ## Snitch Cluster Tiling Support
 
 ### Added
@@ -53,6 +57,7 @@
 ### Changed
 - Add the possibility of changing the simulator when using the snitch-tiled test runner.
 
+
 ## GVSOC support for the Snitch Cluster Platform
 
 ### Added
@@ -62,3 +67,11 @@
 
 ### Changed 
 - Add the RTL library to the snitch_cluster build process in the Makefile, required for GVSOC simulation
+
+
+## Add Float Support & Float GEMM for Generic
+- Float Support for Constbuffer
+- Float GEMM on Generic
+- Added FP GEMM to CI
+- Fixed Float bug on Testslice, CMSIS TestUtil, DivInteger
+- Fixed AbstractDataType Float Bugs
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -6,3 +6,4 @@ All contributors have agreed to an open-source release of their work in the Deep
 * Luka Macan
 * Alberto Dequino
 * Francesco Conti
+* Run Wang
diff --git a/Deeploy/AbstractDataTypes.py b/Deeploy/AbstractDataTypes.py
@@ -217,7 +217,7 @@ def partialOrderUpcast(cls, otherCls: Type[Immediate]) -> bool:
             return False
 
     @classmethod
-    def checkValue(cls, value: Union[int, Iterable[int]], ctxt: Optional[_NetworkContext] = None):
+    def checkValue(cls, value: Union[int, Iterable[int], np.ndarray], ctxt: Optional[_NetworkContext] = None):
 
         if isinstance(value, int):
             _max, _min = (value, value)
@@ -238,6 +238,7 @@ def checkValue(cls, value: Union[int, Iterable[int]], ctxt: Optional[_NetworkCon
 class FloatImmediate(Immediate[Union[float, Iterable[float]], _ImmediateType]):
     typeMantissa: int  #: int: Represents the number of bits reserved for the mantissa part
     typeExponent: int  #: int: Represents the number of bits reserved for the exponent part
+    typeMin: float
 
     @_classproperty
     def typeExponentMax(cls) -> int:
@@ -249,6 +250,10 @@ def typeExponentOffset(cls) -> int:
         # The offset added to the exponent
         return 2**(cls.typeExponent - 1) - 1
 
+    @_classproperty
+    def typeMin(cls) -> float:
+        return -math.inf
+
     @classmethod
     def partialOrderUpcast(cls, otherCls: Type[Immediate]) -> bool:
         if issubclass(otherCls, FloatImmediate):
@@ -257,7 +262,7 @@ def partialOrderUpcast(cls, otherCls: Type[Immediate]) -> bool:
             return False
 
     @classmethod
-    def checkValue(cls, value: Union[float, Iterable[float]], ctxt: Optional[_NetworkContext] = None):
+    def checkValue(cls, value: Union[float, Iterable[float], np.ndarray], ctxt: Optional[_NetworkContext] = None):
         """
         This method tries to manually cast standard python's standard immediate float precision values 
         (64 bits) to an arbitrary FP representation and check if the new representation is close enough 
@@ -268,7 +273,7 @@ def checkValue(cls, value: Union[float, Iterable[float]], ctxt: Optional[_Networ
         if isinstance(value, float):
             _val_list.append(value)
         elif isinstance(value, np.ndarray):
-            _val_list = value.tolist()
+            _val_list = value.flatten().tolist()
         elif isinstance(value, Iterable):
             for i in value:
                 _val_list.append(i)

diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py
@@ -385,9 +385,9 @@ class ConstantBuffer(VariableBuffer):
     def __init__(self, name: str = '', shape = [1], values = [0]):
         super().__init__(name, shape)
         values = np.asarray(values)
-        intArray = values.astype(int)
-        assert (np.abs(values - intArray)).max() < 0.001, "Constant value {name} is NOT an integer!"
-        self.values = intArray  #: np.array: Stores the underlying weights in Ptyhon-type representation
+        # intArray = values.astype(int)
+        # assert (np.abs(values - intArray)).max() < 0.001, "Constant value {name} is NOT an integer!"
+        self.values = values  #: np.array: Stores the underlying weights in Python-type representation
 
         # Do not override - ConstantBuffers are assumed to be always live!
         self._live = True

diff --git a/Deeploy/Targets/CortexM/Templates/CMSISUtils.py b/Deeploy/Targets/CortexM/Templates/CMSISUtils.py
@@ -191,10 +191,10 @@ def bindFCParams(ctxt,
     if isinstance(mul, str):
         __mul = ctxt.lookup(mul).values
         assert np.ndim(__mul) == 0, "Mul is not scalar!"
-        _mul = __mul.item()
+        _mul = int(__mul.item())
         ctxt.lookup(mul)._deploy = False
     else:
-        _mul = mul
+        _mul = int(mul)
 
     if isinstance(shift, str):
         __shift = ctxt.lookup(shift).values

diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py
@@ -30,15 +30,16 @@
 from Deeploy.AbstractDataTypes import PointerClass
 from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \
     MemoryManagementGeneration, MemoryPassthroughGeneration
-from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, int8_t, \
-    int32_t, uint8_t
+from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \
+    int8_t, int32_t, uint8_t
 from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
 from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, ConvTemplate, DebugPrintTemplate, \
-    DummyTemplate, DWConvTemplate, FloatAddTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, \
-    ITAPartialMaxTemplate, MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, ReduceMeanTemplate, \
-    ReduceSumTemplate, RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, \
-    TransposeTemplate, iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate
+    DummyTemplate, DWConvTemplate, FloatAddTemplate, FloatGemmTemplate, GatherTemplate, GemmTemplate, \
+    IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, MaxPoolTemplate, MulTemplate, \
+    PadTemplate, ReduceMeanTemplate, ReduceSumTemplate, RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, \
+    RQSiGELUTemplate, SliceTemplate, TransposeTemplate, iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, \
+    iSoftmaxTemplate
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DebugPrintChecker, \
     DummyChecker, FloatAddChecker, GatherChecker, GELUChecker, GEMMChecker, IntegerDivChecker, MatMulChecker, \
     MaxPoolChecker, MulChecker, PadChecker, ReduceMeanChecker, ReduceSumChecker, RequantShiftChecker, ReshapeChecker, \
@@ -96,10 +97,16 @@
 BasicGELUBinding = NodeBinding(GELUChecker([PointerClass(int8_t)], [PointerClass(int32_t)]),
                                iGELUTemplate.referenceTemplate, BasicTransformer)
 
-BasicGEMMBinding = NodeBinding(
-    GEMMChecker(
-        [PointerClass(int8_t), PointerClass(int8_t), PointerClass(int32_t)], [PointerClass(int32_t)]),
-    GemmTemplate.referenceTemplate, BasicTransformer)
+BasicGEMMBindings = [
+    NodeBinding(
+        GEMMChecker([PointerClass(int8_t), PointerClass(int8_t),
+                     PointerClass(int32_t)], [PointerClass(int32_t)]), GemmTemplate.referenceTemplate, BasicTransformer)
+] + [
+    NodeBinding(
+        GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
+                     PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGemmTemplate.referenceTemplate,
+        BasicTransformer)
+]
 
 BasicIntegerDivBinding = NodeBinding(
     IntegerDivChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]),

diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py
@@ -29,7 +29,7 @@
     StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
 from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConv1DBinding, BasicConv2DBinding, \
     BasicDebugPrintBindings, BasicDWConv1DBinding, BasicDWConv2DBinding, BasicGatherBindings, BasicGELUBinding, \
-    BasicGEMMBinding, BasicIntegerDivBinding, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \
+    BasicGEMMBindings, BasicIntegerDivBinding, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \
     BasicLayerNormBinding, BasicMatMulBinding, BasicMaxPool2DBinding, BasicMulBindings, BasicPad1DBindings, \
     BasicPad2DBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, \
     BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBinding, \
@@ -56,7 +56,7 @@
 FlattenMapper = NodeMapper(FlattenParser(), BasicReshapeBindings)
 GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings)
 GELUMapper = NodeMapper(iGELUParser(), [BasicGELUBinding])
-GEMMMapper = NodeMapper(GenericGEMMParser(), [BasicGEMMBinding])
+GEMMMapper = NodeMapper(GenericGEMMParser(), BasicGEMMBindings)
 iLayerNormMapper = NodeMapper(iLayerNormParser(), [BasicLayerNormBinding])
 IntegerDivMapper = NodeMapper(IntegerDivParser(), [BasicIntegerDivBinding])
 ITAMaxMapper = NodeMapper(ITAMaxParser(), [BasicITASoftmaxBinding])

diff --git a/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py
@@ -0,0 +1,76 @@
+# ----------------------------------------------------------------------
+#
+# File: FloatGemmTemplate.py
+#
+# Last edited: 05.01.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Philip Wiese, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _FloatGemmTemplate(NodeTemplate):
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        """Set the A/B/C/Y offset entries to 0 and return (ctxt, operatorRepresentation, [])."""
+        A = ctxt.lookup(operatorRepresentation['A'])
+        B = ctxt.lookup(operatorRepresentation['B'])
+        C = ctxt.lookup(operatorRepresentation['C'])
+        Y = ctxt.lookup(operatorRepresentation['data_out'])
+        # NOTE(review): A, B, C and Y are never read below; presumably the lookups only validate the names - confirm
+        operatorRepresentation['A_offset'] = 0
+        operatorRepresentation['B_offset'] = 0
+        operatorRepresentation['C_offset'] = 0
+        operatorRepresentation['Y_offset'] = 0
+        # The context is passed back as received; the trailing list is always empty
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _FloatGemmTemplate("""
+// GEMM float (Name: ${nodeName}, Op: ${nodeOp})
+BEGIN_SINGLE_CORE
+    ${A_type.typeName} ref_${data_out}_${A} = ${A};
+    ${B_type.typeName} ref_${data_out}_${B} = ${B};
+    ${C_type.typeName} ref_${data_out}_${C} = ${C};
+    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+    // Per batch element: Y = A * B + C, indexed row-major with A as M x N, B as N x O, C and Y as M x O
+    for(uint32_t i=0; i<${batch}; i++){
+        for(uint32_t m=0; m<${M}; m++){
+            for(uint32_t n=0; n<${O}; n++){
+                ref_${data_out}_${data_out}[m * ${O} + n] = ref_${data_out}_${C}[m * ${O} + n];
+                for(uint32_t k=0; k<${N}; k++){
+                    ref_${data_out}_${data_out}[m * ${O} + n] += ref_${data_out}_${A}[m * ${N} + k] * ref_${data_out}_${B}[k * ${O} + n];
+                }
+            }
+        }
+        // Advance each operand by its own element count so the next batch starts at the right address
+        ref_${data_out}_${A} += ${M} * ${N};
+        ref_${data_out}_${B} += ${N} * ${O};
+        ref_${data_out}_${C} += ${M} * ${O};
+        ref_${data_out}_${data_out} += ${M} * ${O};
+    }
+END_SINGLE_CORE
+""")
diff --git a/Deeploy/Targets/Generic/Templates/SliceTemplate.py b/Deeploy/Targets/Generic/Templates/SliceTemplate.py
@@ -67,12 +67,12 @@ def alignToContext(self, ctxt: NetworkContext,
 for dim in data_in_shape[1:]:
      dimSteps.append(dimSteps[-1]//dim)
 %>
-<%
-transferSize = dimSteps[axes[-1]]
+<%                                   
+transferSize = dimSteps[int(axes[-1])]
 %>
 <%
-if axes[0] > 0:
-    preAxes = list(range(axes[0]))
+if int(axes[0]) > 0:
+    preAxes = list(range(int(axes[0])))
 else:
     preAxes = []
 %>
@@ -100,7 +100,7 @@ def alignToContext(self, ctxt: NetworkContext,
 % endfor
 memcpy(ref_${data_out}, ${data_in} + ${data_out}_offset_${axis}, ${transferSize* data_out_type.referencedType.typeWidth//8});
 ref_${data_out} += ${transferSize};
-% for axis in range(axes[-1]+1):
+% for axis in range(int(axes[-1])+1):
 }
 % endfor
 """)
diff --git a/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py b/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py
@@ -77,8 +77,10 @@ def alignToContext(self, ctxt: NetworkContext,
 
 inSignage = "s" if signedI else "u"
 outSignage = "s" if signedO else "u"
+mul_intimmediate = int(mul_immediate)
+add_intimmediate = int(add_immediate)                    
 %>
 
 // UniformRequantShift (Name: ${nodeName}, Op: ${nodeOp})
-UniformRequantShift_${inSignage}${data_in_type.referencedType.typeWidth}_${outSignage}${data_out_type.referencedType.typeWidth}(${data_in}, ${size}, ${mul_immediate}, ${add_immediate}, ${data_out}, ${log2Dstring}, ${channel_width}, 0, 0 , ${output_min}, ${output_max}, 1);
+UniformRequantShift_${inSignage}${data_in_type.referencedType.typeWidth}_${outSignage}${data_out_type.referencedType.typeWidth}(${data_in}, ${size}, ${mul_intimmediate}, ${add_intimmediate}, ${data_out}, ${log2Dstring}, ${channel_width}, 0, 0 , ${output_min}, ${output_max}, 1);
 """)
diff --git a/DeeployTest/Platforms/Generic/main.c b/DeeployTest/Platforms/Generic/main.c
@@ -50,20 +50,21 @@ int main() {
 
   int32_t tot_err = 0;
   uint32_t tot = 0;
-  int32_t diff;
-  int32_t expected, actual;
+  float32_t diff;
+  float32_t expected, actual;
   for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
-    tot += DeeployNetwork_outputs_bytes[buf];
-    for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) {
-      expected = ((char *)testOutputVector[buf])[i];
-      actual = ((char *)DeeployNetwork_outputs[buf])[i];
+    tot += DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t);
+    for (uint32_t i = 0;
+         i < DeeployNetwork_outputs_bytes[buf] / sizeof(float32_t); i++) {
+      expected = ((float32_t *)testOutputVector[buf])[i];
+      actual = ((float32_t *)DeeployNetwork_outputs[buf])[i];
       diff = expected - actual;
 
-      if (diff) {
+      if ((diff < 0 ? -diff : diff) > 1e-5) {
         tot_err += 1;
-        printf("Expected: %4d  ", expected);
-        printf("Actual: %4d  ", actual);
-        printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
+        printf("Expected: %10.6f  ", expected);
+        printf("Actual: %10.6f  ", actual);
+        printf("Diff: %10.6f at Index %12u in Output %u\r\n", diff, i, buf);
       }
     }
   }

diff --git a/DeeployTest/Tests/testFloatGEMM/inputs.npz b/DeeployTest/Tests/testFloatGEMM/inputs.npz
diff --git a/DeeployTest/Tests/testFloatGEMM/network.onnx b/DeeployTest/Tests/testFloatGEMM/network.onnx
diff --git a/DeeployTest/Tests/testFloatGEMM/outputs.npz b/DeeployTest/Tests/testFloatGEMM/outputs.npz