From 2ec61a0234fad9496c3f1776204188e1f609f32b Mon Sep 17 00:00:00 2001
From: francovaro <francovaro@gmail.com>
Date: Fri, 8 Mar 2024 09:15:35 +0100
Subject: [PATCH] SDK release 1.47.0

---
 EdgeImpulse.EI-SDK.pdsc                       |  12 +-
 EdgeImpulse.pidx                              |   4 +-
 .../classifier/ei_fill_result_struct.h        | 499 +++++++++---------
 .../classifier/ei_model_types.h               |  68 +++
 .../edge-impulse-sdk/classifier/ei_nms.h      | 134 +++--
 .../classifier/ei_run_classifier.h            | 133 +++--
 .../edge-impulse-sdk/classifier/ei_run_dsp.h  | 109 +---
 .../edge-impulse-sdk/dsp/ei_dsp_handle.h      |  58 ++
 edgeimpulse/edge-impulse-sdk/dsp/ei_flatten.h | 198 +++++++
 .../edge-impulse-sdk/dsp/numpy_types.h        |   4 +-
 .../edge-impulse-sdk/dsp/returntypes.h        |  35 ++
 .../edge-impulse-sdk/dsp/returntypes.hpp      |   5 +-
 .../dsp/speechpy/processing.hpp               |   2 +-
 .../porting/ei_classifier_porting.h           |  29 +-
 .../edge-impulse-sdk/porting/ei_logging.h     |   6 +-
 .../tensorflow/lite/micro/kernels/softmax.cpp |   2 +-
 .../tensorflow/lite/micro/micro_allocator.cpp |   2 +-
 17 files changed, 825 insertions(+), 475 deletions(-)
 create mode 100644 edgeimpulse/edge-impulse-sdk/dsp/ei_dsp_handle.h
 create mode 100644 edgeimpulse/edge-impulse-sdk/dsp/ei_flatten.h
 create mode 100644 edgeimpulse/edge-impulse-sdk/dsp/returntypes.h
diff --git a/EdgeImpulse.EI-SDK.pdsc b/EdgeImpulse.EI-SDK.pdsc
index cbe65bc..aa1a669 100644
--- a/EdgeImpulse.EI-SDK.pdsc
+++ b/EdgeImpulse.EI-SDK.pdsc
@@ -5,13 +5,16 @@
 	<name>EI-SDK</name>
 	<license>LICENSE-apache-2.0.txt</license>
 	<description>Edge Impulse SDK</description>
-	<url>https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.46.6/</url>
+	<url>https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.47.0/</url>
 	<supportContact>hello@edgeimpulse.com</supportContact>
 	<repository type="git">https://github.com/edgeimpulse/edge-impulse-sdk-pack.git</repository>
 	<releases>
-		<release version="1.46.6" tag="v1.46.6" date="2024-03-06" url="https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.46.6/EdgeImpulse.EI-SDK.1.46.6.pack">
+		<release version="1.47.0" tag="v1.47.0" date="2024-03-08" url="https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.47.0/EdgeImpulse.EI-SDK.1.47.0.pack">
       		EI-SDK
     	</release>	
+		<release version="1.46.6" tag="v1.46.6" date="2024-03-06" url="https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.46.6/EdgeImpulse.EI-SDK.1.46.6.pack">
+      		EI-SDK
+    	</release>
 		<release version="1.46.4" tag="v1.46.4" date="2024-03-05" url="https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.46.4/EdgeImpulse.EI-SDK.1.46.4.pack">
       		EI-SDK
     	</release>
@@ -77,7 +80,7 @@
     </packages>
     </requirements>
 	<components>
-		<component Cclass="EdgeImpulse" Cgroup="SDK" Cversion="1.46.6">
+		<component Cclass="EdgeImpulse" Cgroup="SDK" Cversion="1.47.0">
 			<description>Edge Impulse SDK</description>	
 				<!-- short component description -->
 				<files>
@@ -535,11 +538,14 @@
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/tensorflow/lite/c/c_api_types.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/tensorflow/lite/c/builtin_op_data.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/tensorflow/lite/c/common.h"/>
+					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/ei_flatten.h"/>
+					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/ei_dsp_handle.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/numpy_types.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/ei_profiler.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/ei_utils.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/ei_alloc.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/ei_vector.h"/>
+					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/returntypes.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/kissfft/_kiss_fft_guts.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/kissfft/kiss_fft.h"/>
 					<file category="header" name="edgeimpulse/edge-impulse-sdk/dsp/kissfft/kissfft.h"/>
diff --git a/EdgeImpulse.pidx b/EdgeImpulse.pidx
index 455dbc3..a8f7a49 100644
--- a/EdgeImpulse.pidx
+++ b/EdgeImpulse.pidx
@@ -2,8 +2,8 @@
 <index schemaVersion="1.0.0" xs:noNamespaceSchemaLocation="PackIndex.xsd" xmlns:xs="http://www.w3.org/2001/XMLSchema-instance">
   <vendor>EdgeImpulse</vendor>
   <url>https://raw.githubusercontent.com/edgeimpulse/edge-impulse-sdk-pack/main/</url>
-  <timestamp>2024-03-06 17:40:41</timestamp>
+  <timestamp>2024-03-08 09:13:21</timestamp>
   <pindex>
-    <pdsc url="https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.46.6/" vendor="EdgeImpulse" name="EI-SDK" version="1.46.6"/>
+    <pdsc url="https://github.com/edgeimpulse/edge-impulse-sdk-pack/releases/download/v1.47.0/" vendor="EdgeImpulse" name="EI-SDK" version="1.47.0"/>
   </pindex>
 </index>
diff --git a/edgeimpulse/edge-impulse-sdk/classifier/ei_fill_result_struct.h b/edgeimpulse/edge-impulse-sdk/classifier/ei_fill_result_struct.h
index e231aa8..e19bf58 100644
--- a/edgeimpulse/edge-impulse-sdk/classifier/ei_fill_result_struct.h
+++ b/edgeimpulse/edge-impulse-sdk/classifier/ei_fill_result_struct.h
@@ -991,112 +991,106 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_tao_decode_de
     size_t row_count = output_features_count / col_size;
 
     for (size_t cls_idx = 1; cls_idx < (size_t)(impulse->label_count + 1); cls_idx++)  {
+
+        // create boxes, scores and labels structures for nms
+        matrix_t boxes(row_count, 4);
+        matrix_t scores(row_count, 1);
+        matrix_i32_t classes(row_count, 1);
+
         for (size_t ix = 0; ix < row_count; ix++) {
 
             float score = (static_cast<float>(data[ix * col_size + cls_idx]) - zero_point) * scale;
             score = clip_val(score, 0.0f, 1.0f);
 
-            if (score >= impulse->object_detection_threshold && score <= 1.0f) {
-
-                // # 1. calculate boxes location
-                size_t base_ix = ix * col_size + col_size; // references the end of the row
-
-                float r_12 = (static_cast<float>(data[base_ix - 12]) - zero_point) * scale;
-                float r_11 = (static_cast<float>(data[base_ix - 11]) - zero_point) * scale;
-                float r_10 = (static_cast<float>(data[base_ix - 10]) - zero_point) * scale;
-                float r_9  = (static_cast<float>(data[base_ix -  9]) - zero_point) * scale;
-                float r_8  = (static_cast<float>(data[base_ix -  8]) - zero_point) * scale;
-                float r_7  = (static_cast<float>(data[base_ix -  7]) - zero_point) * scale;
-                float r_6  = (static_cast<float>(data[base_ix -  6]) - zero_point) * scale;
-                float r_5  = (static_cast<float>(data[base_ix -  5]) - zero_point) * scale;
-                float r_4  = (static_cast<float>(data[base_ix -  4]) - zero_point) * scale;
-                float r_3  = (static_cast<float>(data[base_ix -  3]) - zero_point) * scale;
-                float r_2  = (static_cast<float>(data[base_ix -  2]) - zero_point) * scale;
-                float r_1  = (static_cast<float>(data[base_ix -  1]) - zero_point) * scale;
-
-                // cx_pred = y_pred[..., -12]
-                // cy_pred = y_pred[..., -11]
-                // w_pred = y_pred[..., -10]
-                // h_pred = y_pred[..., -9]
-                float cx_pred = r_12;
-                float cy_pred = r_11;
-                float w_pred  = r_10;
-                float h_pred  = r_9;
-
-                // w_anchor = y_pred[..., -6] - y_pred[..., -8]
-                // h_anchor = y_pred[..., -5] - y_pred[..., -7]
-                float w_anchor = r_6 - r_8;
-                float h_anchor = r_5 - r_7;
-
-                // cx_anchor = tf.truediv(y_pred[..., -6] + y_pred[..., -8], 2.0)
-                // cy_anchor = tf.truediv(y_pred[..., -5] + y_pred[..., -7], 2.0)
-                float cx_anchor = (r_6 + r_8) / 2.0f;
-                float cy_anchor = (r_5 + r_7) / 2.0f;
-
-                // cx_variance = y_pred[..., -4]
-                // cy_variance = y_pred[..., -3]
-                float cx_variance = r_4;
-                float cy_variance = r_3;
-
-                // variance_w = y_pred[..., -2]
-                // variance_h = y_pred[..., -1]
-                float variance_w = r_2;
-                float variance_h = r_1;
-
-                // # Convert anchor box offsets to image offsets.
-                // cx = cx_pred * cx_variance * w_anchor + cx_anchor
-                // cy = cy_pred * cy_variance * h_anchor + cy_anchor
-                // w = tf.exp(w_pred * variance_w) * w_anchor
-                // h = tf.exp(h_pred * variance_h) * h_anchor
-                float cx = cx_pred * cx_variance * w_anchor + cx_anchor;
-                float cy = cy_pred * cy_variance * h_anchor + cy_anchor;
-                float w = exp(w_pred * variance_w) * w_anchor;
-                float h = exp(h_pred * variance_h) * h_anchor;
-
-                // # Convert 'centroids' to 'corners'.
-                float xmin = cx - (w / 2.0f);
-                float ymin = cy - (h / 2.0f);
-                float xmax = cx + (w / 2.0f);
-                float ymax = cy + (h / 2.0f);
-
-                xmin = xmin * impulse->input_width;
-                ymin = ymin * impulse->input_height;
-                xmax = xmax * impulse->input_width;
-                ymax = ymax * impulse->input_height;
-
-                xmin = clip_val(xmin, 0.0f, (float)impulse->input_width);
-                ymin = clip_val(ymin, 0.0f, (float)impulse->input_height);
-                xmax = clip_val(xmax, 0.0f, (float)impulse->input_width);
-                ymax = clip_val(ymax, 0.0f, (float)impulse->input_height);
-
-                float w0 = xmax - xmin;
-                float h0 = ymax - ymin;
-
-                // will be round to 0
-                if (w0 <= 0 || h0 <= 0) {
-                    continue;
-                }
-
-                ei_impulse_result_bounding_box_t r;
-                // note indexing
-                r.label = ei_classifier_inferencing_categories[cls_idx - 1];
-
-                r.x = static_cast<uint32_t>(xmin);
-                r.y = static_cast<uint32_t>(ymin);
-                r.width = static_cast<uint32_t>(w0);
-                r.height = static_cast<uint32_t>(h0);
-                r.value = score;
-                dec_results.push_back(r);
-            }
+            // # 1. calculate boxes location
+            size_t base_ix = ix * col_size + col_size; // references the end of the row
+
+            float r_12 = (static_cast<float>(data[base_ix - 12]) - zero_point) * scale;
+            float r_11 = (static_cast<float>(data[base_ix - 11]) - zero_point) * scale;
+            float r_10 = (static_cast<float>(data[base_ix - 10]) - zero_point) * scale;
+            float r_9  = (static_cast<float>(data[base_ix -  9]) - zero_point) * scale;
+            float r_8  = (static_cast<float>(data[base_ix -  8]) - zero_point) * scale;
+            float r_7  = (static_cast<float>(data[base_ix -  7]) - zero_point) * scale;
+            float r_6  = (static_cast<float>(data[base_ix -  6]) - zero_point) * scale;
+            float r_5  = (static_cast<float>(data[base_ix -  5]) - zero_point) * scale;
+            float r_4  = (static_cast<float>(data[base_ix -  4]) - zero_point) * scale;
+            float r_3  = (static_cast<float>(data[base_ix -  3]) - zero_point) * scale;
+            float r_2  = (static_cast<float>(data[base_ix -  2]) - zero_point) * scale;
+            float r_1  = (static_cast<float>(data[base_ix -  1]) - zero_point) * scale;
+
+            // cx_pred = y_pred[..., -12]
+            // cy_pred = y_pred[..., -11]
+            // w_pred = y_pred[..., -10]
+            // h_pred = y_pred[..., -9]
+            float cx_pred = r_12;
+            float cy_pred = r_11;
+            float w_pred  = r_10;
+            float h_pred  = r_9;
+
+            // w_anchor = y_pred[..., -6] - y_pred[..., -8]
+            // h_anchor = y_pred[..., -5] - y_pred[..., -7]
+            float w_anchor = r_6 - r_8;
+            float h_anchor = r_5 - r_7;
+
+            // cx_anchor = tf.truediv(y_pred[..., -6] + y_pred[..., -8], 2.0)
+            // cy_anchor = tf.truediv(y_pred[..., -5] + y_pred[..., -7], 2.0)
+            float cx_anchor = (r_6 + r_8) / 2.0f;
+            float cy_anchor = (r_5 + r_7) / 2.0f;
+
+            // cx_variance = y_pred[..., -4]
+            // cy_variance = y_pred[..., -3]
+            float cx_variance = r_4;
+            float cy_variance = r_3;
+
+            // variance_w = y_pred[..., -2]
+            // variance_h = y_pred[..., -1]
+            float variance_w = r_2;
+            float variance_h = r_1;
+
+            // # Convert anchor box offsets to image offsets.
+            // cx = cx_pred * cx_variance * w_anchor + cx_anchor
+            // cy = cy_pred * cy_variance * h_anchor + cy_anchor
+            // w = tf.exp(w_pred * variance_w) * w_anchor
+            // h = tf.exp(h_pred * variance_h) * h_anchor
+            float cx = cx_pred * cx_variance * w_anchor + cx_anchor;
+            float cy = cy_pred * cy_variance * h_anchor + cy_anchor;
+            float w = exp(w_pred * variance_w) * w_anchor;
+            float h = exp(h_pred * variance_h) * h_anchor;
+
+            // # Convert 'centroids' to 'corners'.
+            float xmin = cx - (w / 2.0f);
+            float ymin = cy - (h / 2.0f);
+            float xmax = cx + (w / 2.0f);
+            float ymax = cy + (h / 2.0f);
+
+            xmin *= impulse->input_width;
+            ymin *= impulse->input_height;
+            xmax *= impulse->input_width;
+            ymax *= impulse->input_height;
+
+            // note nms requires [ymin, xmin, ymax, xmax]
+            boxes.buffer[(ix * boxes.cols) + 0] = ymin;
+            boxes.buffer[(ix * boxes.cols) + 1] = xmin;
+            boxes.buffer[(ix * boxes.cols) + 2] = ymax;
+            boxes.buffer[(ix * boxes.cols) + 3] = xmax;
+
+            classes.buffer[ix] = cls_idx-1;
+            scores.buffer[ix] = score;
         }
 
-        EI_IMPULSE_ERROR nms_res = ei_run_nms(impulse, &dec_results);
+        EI_IMPULSE_ERROR nms_res = ei_run_nms(impulse, &dec_results,
+                                              boxes.buffer, scores.buffer,
+                                              classes.buffer, row_count,
+                                              false /*clip_boxes*/);
         if (nms_res != EI_IMPULSE_OK) {
             return nms_res;
         }
 
         for (size_t j = 0; j < dec_results.size(); j++) {
-            results.push_back(dec_results[j]);
+            auto bb = dec_results[j];
+            if (bb.value >= impulse->object_detection_threshold) {
+                results.push_back(bb);
+            }
         }
 
         dec_results.clear();
@@ -1112,17 +1106,17 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_tao_decode_de
         }
     }
 
-    // keep topK
-    if (results.size() > 200) {
-        results.erase(results.begin() + 200, results.end());
-    }
-
-    // sort in reverse order
+    // we sort in reverse order across all classes,
+    // since results for each class are pushed to the end.
     std::sort(results.begin(), results.end(), [ ]( const ei_impulse_result_bounding_box_t& lhs, const ei_impulse_result_bounding_box_t& rhs )
     {
         return lhs.value > rhs.value;
     });
 
+    // keep topK
+    if (results.size() > 200) {
+        results.erase(results.begin() + 200, results.end());
+    }
 
     result->bounding_boxes = results.data();
     result->bounding_boxes_count = results.size();
@@ -1177,7 +1171,9 @@ template<typename T>
 __attribute__((unused)) static void fill_result_struct_tao_yolov3_common(const ei_impulse_t *impulse,
                                                                          ei_impulse_result_t *result,
                                                                          T *data,
-                                                                         matrix_t *output,
+                                                                         matrix_t *boxes,
+                                                                         matrix_t *scores,
+                                                                         matrix_i32_t *classes,
                                                                          float zero_point,
                                                                          float scale,
                                                                          size_t output_features_count) {
@@ -1186,36 +1182,50 @@ __attribute__((unused)) static void fill_result_struct_tao_yolov3_common(const e
     size_t col_size = 11 + impulse->label_count;
     size_t row_count = output_features_count / col_size;
 
-    for (size_t ix = 0; ix < row_count; ix++) {
-
-        size_t data_ix = ix * col_size;
-        float r_0  = (static_cast<float>(data[data_ix +  0]) - zero_point) * scale;
-        float r_1  = (static_cast<float>(data[data_ix +  1]) - zero_point) * scale;
-        float r_2  = (static_cast<float>(data[data_ix +  2]) - zero_point) * scale;
-        float r_3  = (static_cast<float>(data[data_ix +  3]) - zero_point) * scale;
-        float r_4  = (static_cast<float>(data[data_ix +  4]) - zero_point) * scale;
-        float r_5  = (static_cast<float>(data[data_ix +  5]) - zero_point) * scale;
-        float r_6  = (static_cast<float>(data[data_ix +  6]) - zero_point) * scale;
-        float r_7  = (static_cast<float>(data[data_ix +  7]) - zero_point) * scale;
-        float r_8  = (static_cast<float>(data[data_ix +  8]) - zero_point) * scale;
-        float r_9  = (static_cast<float>(data[data_ix +  9]) - zero_point) * scale;
-        float r_10 = (static_cast<float>(data[data_ix + 10]) - zero_point) * scale;
-
-        float by = r_0 + sigmoid(r_6) * r_4;
-        float bx = r_1 + sigmoid(r_7) * r_5;
-        float bh = r_2 * exp(r_8);
-        float bw = r_3 * exp(r_9);
-
-        size_t box_ix = ix * output->cols;
-        output->buffer[box_ix + 0] = bx - 0.5 * bw; // xmin
-        output->buffer[box_ix + 1] = by - 0.5 * bh; // ymin
-        output->buffer[box_ix + 2] = bx + 0.5 * bw; // xmax
-        output->buffer[box_ix + 3] = by + 0.5 * bh; // ymax
-
-        // add class scores
-        for (size_t cls_idx = 0; cls_idx < impulse->label_count; cls_idx++) {
+    for (size_t cls_idx = 0; cls_idx < (size_t)impulse->label_count; cls_idx++)  {
+        for (size_t ix = 0; ix < row_count; ix++) {
+            size_t data_ix = ix * col_size;
+            float r_0  = (static_cast<float>(data[data_ix +  0]) - zero_point) * scale;
+            float r_1  = (static_cast<float>(data[data_ix +  1]) - zero_point) * scale;
+            float r_2  = (static_cast<float>(data[data_ix +  2]) - zero_point) * scale;
+            float r_3  = (static_cast<float>(data[data_ix +  3]) - zero_point) * scale;
+            float r_4  = (static_cast<float>(data[data_ix +  4]) - zero_point) * scale;
+            float r_5  = (static_cast<float>(data[data_ix +  5]) - zero_point) * scale;
+            float r_6  = (static_cast<float>(data[data_ix +  6]) - zero_point) * scale;
+            float r_7  = (static_cast<float>(data[data_ix +  7]) - zero_point) * scale;
+            float r_8  = (static_cast<float>(data[data_ix +  8]) - zero_point) * scale;
+            float r_9  = (static_cast<float>(data[data_ix +  9]) - zero_point) * scale;
+            float r_10 = (static_cast<float>(data[data_ix + 10]) - zero_point) * scale;
             float cls = (static_cast<float>(data[data_ix + 11 + cls_idx]) - zero_point) * scale;
-            output->buffer[box_ix + 4 + cls_idx] = sigmoid(cls) * sigmoid(r_10);
+
+            float by = r_0 + sigmoid(r_6) * r_4;
+            float bx = r_1 + sigmoid(r_7) * r_5;
+            float bh = r_2 * exp(r_8);
+            float bw = r_3 * exp(r_9);
+
+            size_t box_ix = boxes->cols * ((cls_idx * row_count) + ix);
+            size_t class_ix = classes->cols * ((cls_idx * row_count) + ix);
+            size_t score_ix = scores->cols * ((cls_idx * row_count) + ix);
+
+            float ymin = by - 0.5 * bh;
+            float xmin = bx - 0.5 * bw;
+            float ymax = by + 0.5 * bh;
+            float xmax = bx + 0.5 * bw;
+
+            // from relative to absolute
+            ymin *= impulse->input_height;
+            xmin *= impulse->input_width;
+            ymax *= impulse->input_height;
+            xmax *= impulse->input_width;
+
+            // [ymin, xmin, ymax, xmax]
+            boxes->buffer[box_ix + 0] = ymin;
+            boxes->buffer[box_ix + 1] = xmin;
+            boxes->buffer[box_ix + 2] = ymax;
+            boxes->buffer[box_ix + 3] = xmax;
+
+            classes->buffer[class_ix] = cls_idx;
+            scores->buffer[score_ix] = sigmoid(cls) * sigmoid(r_10);
         }
     }
 }
@@ -1229,7 +1239,9 @@ template<typename T>
 __attribute__((unused)) static void fill_result_struct_tao_yolov4_common(const ei_impulse_t *impulse,
                                                                          ei_impulse_result_t *result,
                                                                          T *data,
-                                                                         matrix_t *output,
+                                                                         matrix_t *boxes,
+                                                                         matrix_t *scores,
+                                                                         matrix_i32_t *classes,
                                                                          float zero_point,
                                                                          float scale,
                                                                          size_t output_features_count) {
@@ -1239,44 +1251,61 @@ __attribute__((unused)) static void fill_result_struct_tao_yolov4_common(const e
     size_t row_count = output_features_count / col_size;
     const float grid_scale_xy = 1.0f;
 
-    for (size_t ix = 0; ix < row_count; ix++) {
+    for (size_t cls_idx = 0; cls_idx < (size_t)impulse->label_count; cls_idx++)  {
+        for (size_t ix = 0; ix < row_count; ix++) {
+
+            float r_0  = (static_cast<float>(data[ix * col_size +  0]) - zero_point) * scale;
+            float r_1  = (static_cast<float>(data[ix * col_size +  1]) - zero_point) * scale;
+            float r_2  = (static_cast<float>(data[ix * col_size +  2]) - zero_point) * scale;
+            float r_3  = (static_cast<float>(data[ix * col_size +  3]) - zero_point) * scale;
+            float r_4  = (static_cast<float>(data[ix * col_size +  4]) - zero_point) * scale;
+            float r_5  = (static_cast<float>(data[ix * col_size +  5]) - zero_point) * scale;
+            float r_6  = (static_cast<float>(data[ix * col_size +  6]) - zero_point) * scale;
+            float r_7  = (static_cast<float>(data[ix * col_size +  7]) - zero_point) * scale;
+            float r_8  = (static_cast<float>(data[ix * col_size +  8]) - zero_point) * scale;
+            float r_9  = (static_cast<float>(data[ix * col_size +  9]) - zero_point) * scale;
+            float r_10 = (static_cast<float>(data[ix * col_size + 10]) - zero_point) * scale;
+
+            float pred_y = sigmoid(r_6) * grid_scale_xy - (grid_scale_xy - 1.0f) / 2.0f;
+            float pred_x = sigmoid(r_7) * grid_scale_xy - (grid_scale_xy - 1.0f) / 2.0f;
+            float pred_h = exp(std::min(r_8, 8.0f));
+            float pred_w = exp(std::min(r_9, 8.0f));
+
+            r_6 = pred_y;
+            r_7 = pred_x;
+            r_8 = pred_h;
+            r_9 = pred_w;
+
+            float by = r_0 + r_6 * r_4;
+            float bx = r_1 + r_7 * r_5;
+            float bh = r_2 * r_8;
+            float bw = r_3 * r_9;
+
+            size_t box_ix = boxes->cols * ((cls_idx * row_count) + ix);
+            size_t class_ix = classes->cols * ((cls_idx * row_count) + ix);
+            size_t score_ix = scores->cols * ((cls_idx * row_count) + ix);
+
+            float ymin = by - 0.5 * bh;
+            float xmin = bx - 0.5 * bw;
+            float ymax = by + 0.5 * bh;
+            float xmax = bx + 0.5 * bw;
+
+            // from relative to absolute
+            ymin *= impulse->input_height;
+            xmin *= impulse->input_width;
+            ymax *= impulse->input_height;
+            xmax *= impulse->input_width;
+
+            // [ymin, xmin, ymax, xmax]
+            boxes->buffer[box_ix + 0] = ymin;
+            boxes->buffer[box_ix + 1] = xmin;
+            boxes->buffer[box_ix + 2] = ymax;
+            boxes->buffer[box_ix + 3] = xmax;
+
+            classes->buffer[class_ix] = cls_idx;
 
-        float r_0  = (static_cast<float>(data[ix * col_size +  0]) - zero_point) * scale;
-        float r_1  = (static_cast<float>(data[ix * col_size +  1]) - zero_point) * scale;
-        float r_2  = (static_cast<float>(data[ix * col_size +  2]) - zero_point) * scale;
-        float r_3  = (static_cast<float>(data[ix * col_size +  3]) - zero_point) * scale;
-        float r_4  = (static_cast<float>(data[ix * col_size +  4]) - zero_point) * scale;
-        float r_5  = (static_cast<float>(data[ix * col_size +  5]) - zero_point) * scale;
-        float r_6  = (static_cast<float>(data[ix * col_size +  6]) - zero_point) * scale;
-        float r_7  = (static_cast<float>(data[ix * col_size +  7]) - zero_point) * scale;
-        float r_8  = (static_cast<float>(data[ix * col_size +  8]) - zero_point) * scale;
-        float r_9  = (static_cast<float>(data[ix * col_size +  9]) - zero_point) * scale;
-        float r_10 = (static_cast<float>(data[ix * col_size + 10]) - zero_point) * scale;
-
-        float pred_y = sigmoid(r_6) * grid_scale_xy - (grid_scale_xy - 1.0f) / 2.0f;
-        float pred_x = sigmoid(r_7) * grid_scale_xy - (grid_scale_xy - 1.0f) / 2.0f;
-        float pred_h = exp(std::min(r_8, 8.0f));
-        float pred_w = exp(std::min(r_9, 8.0f));
-
-        r_6 = pred_y;
-        r_7 = pred_x;
-        r_8 = pred_h;
-        r_9 = pred_w;
-
-        float by = r_0 + r_6 * r_4;
-        float bx = r_1 + r_7 * r_5;
-        float bh = r_2 * r_8;
-        float bw = r_3 * r_9;
-
-        output->buffer[ix * output->cols + 0] = bx - 0.5 * bw; // xmin
-        output->buffer[ix * output->cols + 1] = by - 0.5 * bh; // ymin
-        output->buffer[ix * output->cols + 2] = bx + 0.5 * bw; // xmax
-        output->buffer[ix * output->cols + 3] = by + 0.5 * bh; // ymax
-
-        // add class scores
-        for (size_t cls_idx = 0; cls_idx < impulse->label_count; cls_idx++) {
             float cls = (static_cast<float>(data[ix * col_size + 11 + cls_idx]) - zero_point) * scale;
-            output->buffer[ix * output->cols + 4 + cls_idx] = sigmoid(cls) * sigmoid(r_10);
+            scores->buffer[score_ix] = sigmoid(cls) * sigmoid(r_10);
         }
     }
 }
@@ -1286,72 +1315,37 @@ __attribute__((unused)) static void fill_result_struct_tao_yolov4_common(const e
 /**
  * Fill the result structure from an output tensor
 */
-template<typename T>
 __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_tao_yolo_common(const ei_impulse_t *impulse,
                                                                                    ei_impulse_result_t *result,
-                                                                                   T *data,
-                                                                                   float zero_point,
-                                                                                   float scale,
-                                                                                   size_t output_features_count) {
+                                                                                   matrix_t *inp_boxes,
+                                                                                   matrix_t *inp_scores,
+                                                                                   matrix_i32_t *inp_classes,
+                                                                                   size_t nboxes) {
     static std::vector<ei_impulse_result_bounding_box_t> results;
     static std::vector<ei_impulse_result_bounding_box_t> dec_results;
     results.clear();
     dec_results.clear();
 
-    // # x: 3-D tensor. Last dimension is (x_min, y_min, x_max, y_max, cls_confidence[0, 1, ...])
-    size_t col_size = 4 + impulse->label_count;
-    size_t row_count = output_features_count / col_size;
-
     for (size_t cls_idx = 0; cls_idx < impulse->label_count; cls_idx++)  {
-        for (size_t ix = 0; ix < row_count; ix++) {
-
-            size_t base_ix = ix * col_size;
-            float score = static_cast<float>(data[base_ix + 4 + cls_idx] - zero_point) * scale;
-            score = clip_val(score, 0.0f, 1.0f);
-
-            if (score >= impulse->object_detection_threshold && score <= 1.0f) {
 
-                float xmin = static_cast<float>(data[base_ix + 0] - zero_point) * scale;
-                float ymin = static_cast<float>(data[base_ix + 1] - zero_point) * scale;
-                float xmax = static_cast<float>(data[base_ix + 2] - zero_point) * scale;
-                float ymax = static_cast<float>(data[base_ix + 3] - zero_point) * scale;
+        // create boxes, scores and labels structures for nms
+        matrix_t boxes(nboxes, 4, inp_boxes->buffer + (cls_idx * nboxes * 4));
+        matrix_t scores(nboxes, 1, inp_scores->buffer + (cls_idx * nboxes * 1));
+        matrix_i32_t classes(nboxes, 1, inp_classes->buffer + (cls_idx * nboxes * 1));
 
-                xmin = xmin * impulse->input_width;
-                ymin = ymin * impulse->input_height;
-                xmax = xmax * impulse->input_width;
-                ymax = ymax * impulse->input_height;
-
-                xmin = clip_val(xmin, 0.0f, (float)impulse->input_width);
-                ymin = clip_val(ymin, 0.0f, (float)impulse->input_height);
-                xmax = clip_val(xmax, 0.0f, (float)impulse->input_width);
-                ymax = clip_val(ymax, 0.0f, (float)impulse->input_height);
-
-                float w0 = xmax - xmin;
-                float h0 = ymax - ymin;
-
-                if (w0 <= 0 || h0 <= 0) {
-                    continue;
-                }
-
-                ei_impulse_result_bounding_box_t r;
-                r.label = ei_classifier_inferencing_categories[cls_idx];
-
-                r.x = static_cast<uint32_t>(xmin);
-                r.y = static_cast<uint32_t>(ymin);
-                r.width = static_cast<uint32_t>(w0);
-                r.height = static_cast<uint32_t>(h0);
-                r.value = score;
-                dec_results.push_back(r);
-            }
-        }
-
-        EI_IMPULSE_ERROR nms_res = ei_run_nms(impulse, &dec_results);
+        EI_IMPULSE_ERROR nms_res = ei_run_nms(impulse, &dec_results,
+                                              boxes.buffer, scores.buffer,
+                                              classes.buffer, nboxes,
+                                              true /*clip_boxes*/);
         if (nms_res != EI_IMPULSE_OK) {
             return nms_res;
         }
 
         for (size_t j = 0; j < dec_results.size(); j++) {
-            results.push_back(dec_results[j]);
+            auto bb = dec_results[j];
+            if (bb.value >= impulse->object_detection_threshold) {
+                results.push_back(bb);
+            }
         }
 
         dec_results.clear();
@@ -1367,17 +1361,17 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_tao_yolo_comm
         }
     }
 
-    // keep topK
-    if (results.size() > 200) {
-        results.erase(results.begin() + 200, results.end());
-    }
-
-    // sort in reverse order
+    // we sort in descending score order across all classes,
+    // since results for each class are pushed to the end.
     std::sort(results.begin(), results.end(), [ ]( const ei_impulse_result_bounding_box_t& lhs, const ei_impulse_result_bounding_box_t& rhs )
     {
         return lhs.value > rhs.value;
     });
 
+    // keep topK
+    if (results.size() > 200) {
+        results.erase(results.begin() + 200, results.end());
+    }
 
     result->bounding_boxes = results.data();
     result->bounding_boxes_count = results.size();
@@ -1396,11 +1390,14 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_f32_tao_yolov
 #ifdef EI_HAS_TAO_YOLOV3
 
     size_t col_size = 11 + impulse->label_count;
-    size_t row_count = output_features_count / col_size;
-
-    matrix_t boxes(row_count, 4 + impulse->label_count);
-    fill_result_struct_tao_yolov3_common(impulse, result, data, &boxes, 0.0f, 1.0f, output_features_count);
-    return fill_result_struct_tao_yolo_common(impulse, result, boxes.buffer, 0.0f, 1.0f, boxes.rows * boxes.cols);
+    size_t nboxes = output_features_count / col_size;
+
+    // (classes, nboxes, ...)
+    matrix_t boxes_y(nboxes * impulse->label_count, 4);
+    matrix_t scores(nboxes * impulse->label_count, 1);
+    matrix_i32_t classes(nboxes * impulse->label_count, 1);
+    fill_result_struct_tao_yolov3_common(impulse, result, data, &boxes_y, &scores, &classes, 0.0f, 1.0f, output_features_count);
+    return fill_result_struct_tao_yolo_common(impulse, result, &boxes_y, &scores, &classes, nboxes);
 #else
     return EI_IMPULSE_LAST_LAYER_NOT_AVAILABLE;
 #endif // #ifdef EI_HAS_TAO_YOLOV3
@@ -1417,12 +1414,16 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_quantized_tao
                                                                                       float scale,
                                                                                       size_t output_features_count) {
 #ifdef EI_HAS_TAO_YOLOV3
-    size_t col_size = 11 + impulse->label_count;
-    size_t row_count = output_features_count / col_size;
 
-    matrix_t boxes(row_count, 4 + impulse->label_count);
-    fill_result_struct_tao_yolov3_common(impulse, result, data, &boxes, zero_point, scale, output_features_count);
-    return fill_result_struct_tao_yolo_common(impulse, result, boxes.buffer, 0.0f, 1.0f, boxes.rows * boxes.cols);
+    size_t col_size = 11 + impulse->label_count;
+    size_t nboxes = output_features_count / col_size;
+
+    // (classes, nboxes, ...)
+    matrix_t boxes_y(nboxes * impulse->label_count, 4);
+    matrix_t scores(nboxes * impulse->label_count, 1);
+    matrix_i32_t classes(nboxes * impulse->label_count, 1);
+    fill_result_struct_tao_yolov3_common(impulse, result, data, &boxes_y, &scores, &classes, zero_point, scale, output_features_count);
+    return fill_result_struct_tao_yolo_common(impulse, result, &boxes_y, &scores, &classes, nboxes);
 #else
     return EI_IMPULSE_LAST_LAYER_NOT_AVAILABLE;
 #endif // #ifdef EI_HAS_TAO_YOLOV3
@@ -1436,13 +1437,15 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_f32_tao_yolov
                                                                                 float *data,
                                                                                 size_t output_features_count) {
 #ifdef EI_HAS_TAO_YOLOV4
-
     size_t col_size = 11 + impulse->label_count;
-    size_t row_count = output_features_count / col_size;
-
-    matrix_t boxes(row_count, 4 + impulse->label_count);
-    fill_result_struct_tao_yolov4_common(impulse, result, data, &boxes, 0.0f, 1.0f, output_features_count);
-    return fill_result_struct_tao_yolo_common(impulse, result, boxes.buffer, 0.0f, 1.0f, boxes.rows * boxes.cols);
+    size_t nboxes = output_features_count / col_size;
+
+    // (classes, nboxes, ...)
+    matrix_t boxes_y(nboxes * impulse->label_count, 4);
+    matrix_t scores(nboxes * impulse->label_count, 1);
+    matrix_i32_t classes(nboxes * impulse->label_count, 1);
+    fill_result_struct_tao_yolov4_common(impulse, result, data, &boxes_y, &scores, &classes, 0.0f, 1.0f, output_features_count);
+    return fill_result_struct_tao_yolo_common(impulse, result, &boxes_y, &scores, &classes, nboxes);
 #else
     return EI_IMPULSE_LAST_LAYER_NOT_AVAILABLE;
 #endif // #ifdef EI_HAS_TAO_YOLOV4
@@ -1459,12 +1462,16 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_quantized_tao
                                                                                       float scale,
                                                                                       size_t output_features_count) {
 #ifdef EI_HAS_TAO_YOLOV4
-    size_t col_size = 11 + impulse->label_count;
-    size_t row_count = output_features_count / col_size;
 
-    matrix_t boxes(row_count, 4 + impulse->label_count);
-    fill_result_struct_tao_yolov4_common(impulse, result, data, &boxes, zero_point, scale, output_features_count);
-    return fill_result_struct_tao_yolo_common(impulse, result, boxes.buffer, 0.0f, 1.0f, boxes.rows * boxes.cols);
+    size_t col_size = 11 + impulse->label_count;
+    size_t nboxes = output_features_count / col_size;
+
+    // (classes, nboxes, ...)
+    matrix_t boxes_y(nboxes * impulse->label_count, 4);
+    matrix_t scores(nboxes * impulse->label_count, 1);
+    matrix_i32_t classes(nboxes * impulse->label_count, 1);
+    fill_result_struct_tao_yolov4_common(impulse, result, data, &boxes_y, &scores, &classes, zero_point, scale, output_features_count);
+    return fill_result_struct_tao_yolo_common(impulse, result, &boxes_y, &scores, &classes, nboxes);
 #else
     return EI_IMPULSE_LAST_LAYER_NOT_AVAILABLE;
 #endif // #ifdef EI_HAS_TAO_YOLOV4
@@ -1474,7 +1481,7 @@ __attribute__((unused)) static EI_IMPULSE_ERROR fill_result_struct_quantized_tao
 #if EI_CLASSIFIER_SINGLE_FEATURE_INPUT == 0
 bool find_mtx_by_idx(ei_feature_t* mtx, ei::matrix_t** matrix, uint32_t mtx_id, size_t mtx_size) {
     for (size_t i = 0; i < mtx_size; i++) {
-        if (&mtx[i] == NULL) {
+        if (mtx[i].matrix == NULL) {
             continue;
         }
         if (mtx[i].blockId == mtx_id || mtx[i].blockId == 0) {
diff --git a/edgeimpulse/edge-impulse-sdk/classifier/ei_model_types.h b/edgeimpulse/edge-impulse-sdk/classifier/ei_model_types.h
index 1b44811..deda9a3 100644
--- a/edgeimpulse/edge-impulse-sdk/classifier/ei_model_types.h
+++ b/edgeimpulse/edge-impulse-sdk/classifier/ei_model_types.h
@@ -21,6 +21,7 @@
 #include <stdint.h>
 
 #include "edge-impulse-sdk/classifier/ei_classifier_types.h"
+#include "edge-impulse-sdk/dsp/ei_dsp_handle.h"
 #include "edge-impulse-sdk/dsp/numpy.hpp"
 #if EI_CLASSIFIER_USE_FULL_TFLITE || (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_AKIDA) || (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_MEMRYX)
 #include "tensorflow-lite/tensorflow/lite/c/common.h"
@@ -104,6 +105,9 @@ typedef struct {
     void *config;
     uint8_t *axes;
     size_t axes_size;
+    int version;  // future proof, can easily add to this struct now
+    DspHandle* (*factory)(void* config); // nullptr means no state
+    // v1 ends here
 } ei_model_dsp_t;
 
 typedef struct {
@@ -242,6 +246,70 @@ typedef struct ei_impulse {
     ei_object_detection_nms_config_t object_detection_nms;
 } ei_impulse_t;
 
+/**
+ * Runtime state of one impulse: a table with one DspHandle per DSP block,
+ * each created on first use through the block's factory.
+ */
+class ei_impulse_state_t {
+typedef DspHandle* _dsp_handle_ptr_t;
+public:
+    const ei_impulse_t *impulse; // keep a pointer to the impulse
+    _dsp_handle_ptr_t *dsp_handles; // stays nullptr if the table could not be allocated
+    bool is_temp_handle = false; // to know if we're using the old (stateless) API
+
+    ei_impulse_state_t(const ei_impulse_t *impulse)
+        : impulse(impulse)
+    {
+        const auto num_dsp_blocks = impulse->dsp_blocks_size;
+        dsp_handles = (_dsp_handle_ptr_t*)ei_malloc(sizeof(_dsp_handle_ptr_t)*num_dsp_blocks);
+        // ei_malloc may return nullptr; only clear the table when it exists
+        for (size_t ix = 0; dsp_handles && ix < num_dsp_blocks; ix++) {
+            dsp_handles[ix] = nullptr;
+        }
+    }
+
+    // Returns the handle of DSP block ix, creating it on first use. Returns nullptr
+    // if ix is out of range, the table is missing, or no handle could be created.
+    DspHandle* get_dsp_handle(size_t ix) {
+        if (!dsp_handles || ix >= impulse->dsp_blocks_size) {
+            return nullptr;
+        }
+        if (dsp_handles[ix] == nullptr && impulse->dsp_blocks[ix].factory) {
+            dsp_handles[ix] = impulse->dsp_blocks[ix].factory(impulse->dsp_blocks[ix].config);
+        }
+        return dsp_handles[ix];
+    }
+
+    // Destroys every created handle; the next get_dsp_handle() call rebuilds it.
+    void reset()
+    {
+        for (size_t ix = 0; dsp_handles && ix < impulse->dsp_blocks_size; ix++) {
+            delete dsp_handles[ix]; // deleting nullptr is a no-op
+            dsp_handles[ix] = nullptr;
+        }
+    }
+
+    // Heap instances of this class are allocated through the SDK allocator
+    void* operator new(size_t size) { return ei_malloc(size); }
+    void operator delete(void* ptr) { ei_free(ptr); }
+    void* operator new[](size_t size) { return ei_malloc(size); }
+    void operator delete[](void* ptr) { ei_free(ptr); }
+
+    ~ei_impulse_state_t()
+    {
+        reset();
+        ei_free(dsp_handles);
+    }
+};
+
+class ei_impulse_handle_t { // pairs an impulse with its runtime state
+public:
+    ei_impulse_handle_t(const ei_impulse_t *impulse)
+        : state(impulse), impulse(impulse) {};
+    ei_impulse_state_t state; // state object constructed from the same impulse pointer
+    const ei_impulse_t *impulse; // the impulse passed to the constructor
+};
+
 typedef struct {
     uint32_t block_id;
     uint16_t implementation_version;
diff --git a/edgeimpulse/edge-impulse-sdk/classifier/ei_nms.h b/edgeimpulse/edge-impulse-sdk/classifier/ei_nms.h
index e2c208e..0148860 100644
--- a/edgeimpulse/edge-impulse-sdk/classifier/ei_nms.h
+++ b/edgeimpulse/edge-impulse-sdk/classifier/ei_nms.h
@@ -190,7 +190,7 @@ static inline void NonMaxSuppression(const float* boxes, const int num_boxes,
         }
         ++*num_selected_indices;
       }
-      if (next_candidate.score > score_threshold) {
+      if ((soft_nms_sigma > 0.0) && (next_candidate.score > score_threshold)) {
         // Soft suppression might have occurred and current score is still
         // greater than score_threshold; add next_candidate back onto priority
         // queue.
@@ -205,45 +205,22 @@ static inline void NonMaxSuppression(const float* boxes, const int num_boxes,
  */
 EI_IMPULSE_ERROR ei_run_nms(
     const ei_impulse_t *impulse,
-    std::vector<ei_impulse_result_bounding_box_t> *results) {
+    std::vector<ei_impulse_result_bounding_box_t> *results,
+    float *boxes,
+    float *scores,
+    int *classes,
+    size_t bb_count,
+    bool clip_boxes) {
 
-    size_t bb_count = 0;
-    for (size_t ix = 0; ix < results->size(); ix++) {
-        auto bb = results->at(ix);
-        if (bb.value == 0) {
-            continue;
-        }
-        bb_count++;
-    }
-
-    float *boxes = (float*)ei_malloc(4 * bb_count * sizeof(float));
-    float *scores = (float*)ei_malloc(1 * bb_count * sizeof(float));
     int *selected_indices = (int*)ei_malloc(1 * bb_count * sizeof(int));
     float *selected_scores = (float*)ei_malloc(1 * bb_count * sizeof(float));
 
-    if (!scores || !boxes || !selected_indices || !selected_scores) {
-        ei_free(boxes);
-        ei_free(scores);
+    if (!scores || !boxes || !selected_indices || !selected_scores || !classes) {
         ei_free(selected_indices);
         ei_free(selected_scores);
         return EI_IMPULSE_OUT_OF_MEMORY;
     }
 
-    size_t box_ix = 0;
-    for (size_t ix = 0; ix < results->size(); ix++) {
-        auto bb = results->at(ix);
-        if (bb.value == 0) {
-            continue;
-        }
-        boxes[(box_ix * 4) + 0] = bb.y;
-        boxes[(box_ix * 4) + 1] = bb.x;
-        boxes[(box_ix * 4) + 2] = bb.y + bb.height;
-        boxes[(box_ix * 4) + 3] = bb.x + bb.width;
-        scores[box_ix] = bb.value;
-
-        box_ix++;
-    }
-
     //  boxes: box encodings in format [y1, x1, y2, x2], shape: [num_boxes, 4]
     //  num_boxes: number of candidates
     //  scores: scores for candidate boxes, in the same order. shape: [num_boxes]
@@ -269,17 +246,29 @@ EI_IMPULSE_ERROR ei_run_nms(
     std::vector<ei_impulse_result_bounding_box_t> new_results;
 
     for (size_t ix = 0; ix < (size_t)num_selected_indices; ix++) {
-        auto bb = results->at(selected_indices[ix]);
-
-        ei_printf("Found bb with label %s\n", bb.label);
+        // ei_printf("Found bb with label %s\n", bb.label);
 
+        int out_ix = selected_indices[ix];
         ei_impulse_result_bounding_box_t r;
-        r.label = bb.label;
-        r.x = bb.x;
-        r.y = bb.y;
-        r.width = bb.width;
-        r.height = bb.height;
-        r.value = selected_scores[ix];
+        r.label  = impulse->categories[classes[out_ix]];
+        r.value  = selected_scores[ix];
+
+        float ymin = boxes[(out_ix * 4) + 0];
+        float xmin = boxes[(out_ix * 4) + 1];
+        float ymax = boxes[(out_ix * 4) + 2];
+        float xmax = boxes[(out_ix * 4) + 3];
+
+        if (clip_boxes) {
+            ymin = std::min(std::max(ymin, 0.0f), (float)impulse->input_height);
+            xmin = std::min(std::max(xmin, 0.0f), (float)impulse->input_width);
+            ymax = std::min(std::max(ymax, 0.0f), (float)impulse->input_height);
+            xmax = std::min(std::max(xmax, 0.0f), (float)impulse->input_width);
+        }
+
+        r.y      = static_cast<uint32_t>(ymin);
+        r.x      = static_cast<uint32_t>(xmin);
+        r.height = static_cast<uint32_t>(ymax) - r.y;
+        r.width  = static_cast<uint32_t>(xmax) - r.x;
         new_results.push_back(r);
     }
 
@@ -289,8 +278,6 @@ EI_IMPULSE_ERROR ei_run_nms(
         results->push_back(new_results[ix]);
     }
 
-    ei_free(boxes);
-    ei_free(scores);
     ei_free(selected_indices);
     ei_free(selected_scores);
 
@@ -298,12 +285,73 @@ EI_IMPULSE_ERROR ei_run_nms(
 
 }
 
+/**
+ * Run non-max suppression over the results array (for bounding boxes).
+ * Boxes with a non-zero score are flattened into box / score / class-index
+ * arrays and handed to the array-based ei_run_nms() overload.
+ */
+EI_IMPULSE_ERROR ei_run_nms(
+    const ei_impulse_t *impulse,
+    std::vector<ei_impulse_result_bounding_box_t> *results) {
+    size_t bb_count = 0;
+    for (size_t ix = 0; ix < results->size(); ix++) {
+        if (results->at(ix).value != 0) {
+            bb_count++;
+        }
+    }
+
+    float *boxes = (float*)ei_malloc(4 * bb_count * sizeof(float));
+    float *scores = (float*)ei_malloc(1 * bb_count * sizeof(float));
+    int *classes = (int*) ei_malloc(bb_count * sizeof(int));
+
+    if (!scores || !boxes || !classes) {
+        ei_free(boxes);
+        ei_free(scores);
+        ei_free(classes);
+        return EI_IMPULSE_OUT_OF_MEMORY;
+    }
+
+    size_t box_ix = 0;
+    for (size_t ix = 0; ix < results->size(); ix++) {
+        auto bb = results->at(ix);
+        if (bb.value == 0) {
+            continue;
+        }
+        boxes[(box_ix * 4) + 0] = bb.y;
+        boxes[(box_ix * 4) + 1] = bb.x;
+        boxes[(box_ix * 4) + 2] = bb.y + bb.height;
+        boxes[(box_ix * 4) + 3] = bb.x + bb.width;
+        scores[box_ix] = bb.value;
+
+        // Default to class 0 so an unmatched label never leaves the index unset.
+        classes[box_ix] = 0;
+        for (size_t j = 0; j < impulse->label_count; j++) {
+            if (strcmp(impulse->categories[j], bb.label) == 0) {
+                classes[box_ix] = (int)j;
+                break;
+            }
+        }
+
+        box_ix++;
+    }
+
+    EI_IMPULSE_ERROR nms_res = ei_run_nms(impulse, results,
+                                          boxes, scores,
+                                          classes, bb_count,
+                                          true /*clip_boxes*/);
+
+    ei_free(boxes);
+    ei_free(scores);
+    ei_free(classes);
+    return nms_res;
+}
+
 /**
  * Run non-max suppression over the results array (for bounding boxes)
  */
 EI_IMPULSE_ERROR ei_run_nms(std::vector<ei_impulse_result_bounding_box_t> *results) {
 #if EI_CLASSIFIER_HAS_MODEL_VARIABLES == 1
-  const ei_impulse_t impulse = ei_default_impulse;
+  auto& impulse = *ei_default_impulse.impulse;
 #else
   const ei_impulse_t impulse = {
     .object_detection_nms.confidence_threshold = 0.0f,
diff --git a/edgeimpulse/edge-impulse-sdk/classifier/ei_run_classifier.h b/edgeimpulse/edge-impulse-sdk/classifier/ei_run_classifier.h
index 8abdf2b..bf30c59 100644
--- a/edgeimpulse/edge-impulse-sdk/classifier/ei_run_classifier.h
+++ b/edgeimpulse/edge-impulse-sdk/classifier/ei_run_classifier.h
@@ -18,6 +18,7 @@
 #ifndef _EDGE_IMPULSE_RUN_CLASSIFIER_H_
 #define _EDGE_IMPULSE_RUN_CLASSIFIER_H_
 
+#include "ei_model_types.h"
 #include "model-parameters/model_metadata.h"
 
 #include "ei_run_dsp.h"
@@ -26,6 +27,8 @@
 #include "ei_performance_calibration.h"
 
 #include "edge-impulse-sdk/porting/ei_classifier_porting.h"
+#include "edge-impulse-sdk/porting/ei_logging.h"
+#include <memory>
 
 #if EI_CLASSIFIER_HAS_ANOMALY
 #include "inferencing_engines/anomaly.h"
@@ -61,6 +64,7 @@
 #error "Unknown inferencing engine"
 #endif
 
+// This file has an implicit dependency on ei_run_dsp.h, so must come after that include!
 #include "model-parameters/model_variables.h"
 
 #ifdef __cplusplus
@@ -68,7 +72,7 @@ namespace {
 #endif // __cplusplus
 
 /* Function prototypes ----------------------------------------------------- */
-extern "C" EI_IMPULSE_ERROR run_inference(const ei_impulse_t *impulse, ei_feature_t *fmatrix, ei_impulse_result_t *result, bool debug);
+extern "C" EI_IMPULSE_ERROR run_inference(ei_impulse_handle_t *handle, ei_feature_t *fmatrix, ei_impulse_result_t *result, bool debug);
 extern "C" EI_IMPULSE_ERROR run_classifier_image_quantized(const ei_impulse_t *impulse, signal_t *signal, ei_impulse_result_t *result, bool debug);
 static EI_IMPULSE_ERROR can_run_classifier_image_quantized(const ei_impulse_t *impulse, ei_learning_block_t block_ptr);
 
@@ -99,6 +103,7 @@ __attribute__((unused)) void display_results(ei_impulse_result_t* result)
     ei_printf("Predictions (DSP: %d ms., Classification: %d ms., Anomaly: %d ms.): \n",
                 result->timing.dsp, result->timing.classification, result->timing.anomaly);
 #if EI_CLASSIFIER_OBJECT_DETECTION == 1
+    ei_printf("#Object detection results:\r\n");
     bool bb_found = result->bounding_boxes[0].value > 0;
     for (size_t ix = 0; ix < result->bounding_boxes_count; ix++) {
         auto bb = result->bounding_boxes[ix];
@@ -113,17 +118,28 @@ __attribute__((unused)) void display_results(ei_impulse_result_t* result)
     if (!bb_found) {
         ei_printf("    No objects found\n");
     }
-#else
+#elif EI_CLASSIFIER_LABEL_COUNT > 1 // if there is only one label, this is an anomaly only
+    ei_printf("#Classification results:\r\n");
     for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
         ei_printf("    %s: ", result->classification[ix].label);
         ei_printf_float(result->classification[ix].value);
         ei_printf("\n");
     }
-#if EI_CLASSIFIER_HAS_ANOMALY
-    ei_printf("    anomaly score: ");
-    ei_printf_float(result->anomaly);
-    ei_printf("\n");
 #endif
+#if EI_CLASSIFIER_HAS_ANOMALY == 3 // visual AD
+    ei_printf("#Visual anomaly grid results:\r\n");
+    for (uint32_t i = 0; i < result->visual_ad_count; i++) {
+        ei_impulse_result_bounding_box_t bb = result->visual_ad_grid_cells[i];
+        if (bb.value == 0) {
+            continue;
+        }
+        ei_printf("    %s (", bb.label);
+        ei_printf_float(bb.value);
+        ei_printf(") [ x: %u, y: %u, width: %u, height: %u ]\n", bb.x, bb.y, bb.width, bb.height);
+    }
+    ei_printf("Visual anomaly values: Mean %.3f Max %.3f\r\n", result->visual_ad_result.mean_value, result->visual_ad_result.max_value);
+#elif (EI_CLASSIFIER_HAS_ANOMALY > 0) // except for visual AD
+    ei_printf("Anomaly prediction: %.3f\r\n", result->anomaly);
 #endif
 }
 
@@ -138,11 +154,12 @@ __attribute__((unused)) void display_results(ei_impulse_result_t* result)
  * @return     The ei impulse error.
  */
 extern "C" EI_IMPULSE_ERROR run_inference(
-    const ei_impulse_t *impulse,
+    ei_impulse_handle_t *handle,
     ei_feature_t *fmatrix,
     ei_impulse_result_t *result,
     bool debug = false)
 {
+    auto& impulse = handle->impulse;
     for (size_t ix = 0; ix < impulse->learning_blocks_size; ix++) {
 
         ei_learning_block_t block = impulse->learning_blocks[ix];
@@ -185,26 +202,30 @@ extern "C" EI_IMPULSE_ERROR run_inference(
  * @param      impulse  struct with information about model and DSP
  * @param      signal   Sample data
  * @param      result   Output classifier results
+ * @param      handle   Impulse handle (impulse plus DSP state). Must not be nullptr
  * @param[in]  debug    Debug output enable
  *
  * @return     The ei impulse error.
  */
-extern "C" EI_IMPULSE_ERROR process_impulse(const ei_impulse_t *impulse,
+extern "C" EI_IMPULSE_ERROR process_impulse(ei_impulse_handle_t *handle,
                                             signal_t *signal,
                                             ei_impulse_result_t *result,
                                             bool debug = false)
 {
+    if(!handle) {
+        return EI_IMPULSE_INFERENCE_ERROR;
+    }
 
 #if (EI_CLASSIFIER_QUANTIZATION_ENABLED == 1 && (EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_TFLITE || EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_TENSAIFLOW || EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_ONNX_TIDL)) || EI_CLASSIFIER_INFERENCING_ENGINE == EI_CLASSIFIER_DRPAI
     // Shortcut for quantized image models
-    ei_learning_block_t block = impulse->learning_blocks[0];
-    if (can_run_classifier_image_quantized(impulse, block) == EI_IMPULSE_OK) {
-        return run_classifier_image_quantized(impulse, signal, result, debug);
+    ei_learning_block_t block = handle->impulse->learning_blocks[0];
+    if (can_run_classifier_image_quantized(handle->impulse, block) == EI_IMPULSE_OK) {
+        return run_classifier_image_quantized(handle->impulse, signal, result, debug);
     }
 #endif
 
     memset(result, 0, sizeof(ei_impulse_result_t));
-    uint32_t block_num = impulse->dsp_blocks_size + impulse->learning_blocks_size;
+    uint32_t block_num = handle->impulse->dsp_blocks_size + handle->impulse->learning_blocks_size;
 
     // smart pointer to features array
     std::unique_ptr<ei_feature_t[]> features_ptr(new ei_feature_t[block_num]);
@@ -218,30 +239,50 @@ extern "C" EI_IMPULSE_ERROR process_impulse(const ei_impulse_t *impulse,
 
     size_t out_features_index = 0;
 
-    for (size_t ix = 0; ix < impulse->dsp_blocks_size; ix++) {
-        ei_model_dsp_t block = impulse->dsp_blocks[ix];
+    for (size_t ix = 0; ix < handle->impulse->dsp_blocks_size; ix++) {
+        ei_model_dsp_t block = handle->impulse->dsp_blocks[ix];
         matrix_ptrs[ix] = std::unique_ptr<ei::matrix_t>(new ei::matrix_t(1, block.n_output_features));
         features[ix].matrix = matrix_ptrs[ix].get();
         features[ix].blockId = block.blockId;
 
-        if (out_features_index + block.n_output_features > impulse->nn_input_frame_size) {
+        if (out_features_index + block.n_output_features > handle->impulse->nn_input_frame_size) {
             ei_printf("ERR: Would write outside feature buffer\n");
             delete[] matrix_ptrs;
             return EI_IMPULSE_DSP_ERROR;
         }
 
 #if EIDSP_SIGNAL_C_FN_POINTER
-        if (block.axes_size != impulse->raw_samples_per_frame) {
+        if (block.axes_size != handle->impulse->raw_samples_per_frame) {
             ei_printf("ERR: EIDSP_SIGNAL_C_FN_POINTER can only be used when all axes are selected for DSP blocks\n");
             delete[] matrix_ptrs;
             return EI_IMPULSE_DSP_ERROR;
         }
-        int ret = block.extract_fn(signal, features[ix].matrix, block.config, impulse->frequency);
+        auto internal_signal = signal;
 #else
-        SignalWithAxes swa(signal, block.axes, block.axes_size, impulse);
-        int ret = block.extract_fn(swa.get_signal(), features[ix].matrix, block.config, impulse->frequency);
+        SignalWithAxes swa(signal, block.axes, block.axes_size, handle->impulse);
+        auto internal_signal = swa.get_signal();
 #endif
 
+        int ret;
+        if (block.factory) { // ie, if we're using state
+            // Tell the user once that this impulse keeps DSP state between calls
+            static bool has_printed = false;
+            if (!has_printed) {
+                EI_LOGI("Impulse maintains state. Call run_classifier_init() to reset state (e.g. if data stream is interrupted.)\n");
+                has_printed = true;
+            }
+
+            // getter has a lazy init, so we can just call it
+            auto dsp_handle = handle->state.get_dsp_handle(ix);
+            if (!dsp_handle) {
+                delete[] matrix_ptrs; // free the matrix table like the other error paths do
+                return EI_IMPULSE_OUT_OF_MEMORY;
+            }
+            ret = dsp_handle->extract(internal_signal, features[ix].matrix, block.config, handle->impulse->frequency);
+        } else {
+            ret = block.extract_fn(internal_signal, features[ix].matrix, block.config, handle->impulse->frequency);
+        }
+
         if (ret != EIDSP_OK) {
             ei_printf("ERR: Failed to run DSP process (%d)\n", ret);
             delete[] matrix_ptrs;
@@ -257,13 +298,13 @@ extern "C" EI_IMPULSE_ERROR process_impulse(const ei_impulse_t *impulse,
     }
 
 #if EI_CLASSIFIER_SINGLE_FEATURE_INPUT == 0
-    for (size_t ix = 0; ix < impulse->learning_blocks_size; ix++) {
-        ei_learning_block_t block = impulse->learning_blocks[ix];
+    for (size_t ix = 0; ix < handle->impulse->learning_blocks_size; ix++) {
+        ei_learning_block_t block = handle->impulse->learning_blocks[ix];
 
         if (block.keep_output) {
-            matrix_ptrs[impulse->dsp_blocks_size + ix] = std::unique_ptr<ei::matrix_t>(new ei::matrix_t(1, block.output_features_count));
-            features[impulse->dsp_blocks_size + ix].matrix = matrix_ptrs[impulse->dsp_blocks_size + ix].get();
-            features[impulse->dsp_blocks_size + ix].blockId = block.blockId;
+            matrix_ptrs[handle->impulse->dsp_blocks_size + ix] = std::unique_ptr<ei::matrix_t>(new ei::matrix_t(1, block.output_features_count));
+            features[handle->impulse->dsp_blocks_size + ix].matrix = matrix_ptrs[handle->impulse->dsp_blocks_size + ix].get();
+            features[handle->impulse->dsp_blocks_size + ix].blockId = block.blockId;
         }
     }
 #endif // EI_CLASSIFIER_SINGLE_FEATURE_INPUT
@@ -289,13 +330,26 @@ extern "C" EI_IMPULSE_ERROR process_impulse(const ei_impulse_t *impulse,
         ei_printf("Running impulse...\n");
     }
 
-    EI_IMPULSE_ERROR res = run_inference(impulse, features, result, debug);
-
+    EI_IMPULSE_ERROR res = run_inference(handle, features, result, debug);
     delete[] matrix_ptrs;
-
     return res;
 }
 
+/**
+ * @brief      Resets the DSP state held by an impulse handle
+ *
+ * @param      handle  Impulse handle whose state is reset
+ *
+ * @return     EI_IMPULSE_OK on success, EI_IMPULSE_OUT_OF_MEMORY if handle is null.
+ */
+extern "C" EI_IMPULSE_ERROR init_impulse(ei_impulse_handle_t *handle) {
+    if (!handle) {
+        return EI_IMPULSE_OUT_OF_MEMORY;
+    }
+    handle->state.reset();
+    return EI_IMPULSE_OK;
+}
+
 /**
  * @brief      Process a complete impulse for continuous inference
  *
@@ -306,13 +360,13 @@ extern "C" EI_IMPULSE_ERROR process_impulse(const ei_impulse_t *impulse,
  *
  * @return     The ei impulse error.
  */
-extern "C" EI_IMPULSE_ERROR process_impulse_continuous(const ei_impulse_t *impulse,
+extern "C" EI_IMPULSE_ERROR process_impulse_continuous(ei_impulse_handle_t *handle,
                                             signal_t *signal,
                                             ei_impulse_result_t *result,
                                             bool debug,
                                             bool enable_maf)
 {
-
+    auto impulse = handle->impulse;
     static ei::matrix_t static_features_matrix(1, impulse->nn_input_frame_size);
     if (!static_features_matrix.buffer) {
         return EI_IMPULSE_ALLOC_FAILED;
@@ -438,7 +492,7 @@ extern "C" EI_IMPULSE_ERROR process_impulse_continuous(const ei_impulse_t *impul
             ei_printf("Running impulse...\n");
         }
 
-        ei_impulse_error = run_inference(impulse, features, result, debug);
+        ei_impulse_error = run_inference(handle, features, result, debug);
 
 #if EI_CLASSIFIER_CALIBRATION_ENABLED
         if (impulse->sensor == EI_CLASSIFIER_SENSOR_MICROPHONE) {
@@ -656,10 +710,11 @@ extern "C" void run_classifier_init()
 
     classifier_continuous_features_written = 0;
     ei_dsp_clear_continuous_audio_state();
+    init_impulse(&ei_default_impulse);
 
 #if EI_CLASSIFIER_CALIBRATION_ENABLED
 
-    const ei_impulse_t impulse = ei_default_impulse;
+    const auto& impulse = *ei_default_impulse.impulse; // bind the pointee so impulse.calibration resolves
     const ei_model_performance_calibration_t *calibration = &impulse.calibration;
 
     if(calibration != NULL) {
@@ -672,13 +727,14 @@ extern "C" void run_classifier_init()
 /**
  * @brief      Init static vars, for multi-model support
  */
-__attribute__((unused)) void run_classifier_init(const ei_impulse_t *impulse)
+__attribute__((unused)) void run_classifier_init(ei_impulse_handle_t *handle)
 {
     classifier_continuous_features_written = 0;
     ei_dsp_clear_continuous_audio_state();
+    init_impulse(handle);
 
 #if EI_CLASSIFIER_CALIBRATION_ENABLED
-    const ei_model_performance_calibration_t *calibration = &impulse->calibration;
+    const ei_model_performance_calibration_t *calibration = &handle->impulse->calibration;
 
     if(calibration != NULL) {
         avg_scores = new RecognizeEvents(calibration,
@@ -710,7 +766,7 @@ extern "C" EI_IMPULSE_ERROR run_classifier_continuous(
     bool debug = false,
     bool enable_maf = true)
 {
-    const ei_impulse_t impulse = ei_default_impulse;
+    auto& impulse = ei_default_impulse;
     return process_impulse_continuous(&impulse, signal, result, debug, enable_maf);
 }
 
@@ -726,7 +782,7 @@ extern "C" EI_IMPULSE_ERROR run_classifier_continuous(
  * @return     The ei impulse error.
  */
 __attribute__((unused)) EI_IMPULSE_ERROR run_classifier_continuous(
-    const ei_impulse_t *impulse,
+    ei_impulse_handle_t *impulse,
     signal_t *signal,
     ei_impulse_result_t *result,
     bool debug = false,
@@ -747,8 +803,7 @@ extern "C" EI_IMPULSE_ERROR run_classifier(
     ei_impulse_result_t *result,
     bool debug = false)
 {
-    const ei_impulse_t impulse = ei_default_impulse;
-    return process_impulse(&impulse, signal, result, debug);
+    return process_impulse(&ei_default_impulse, signal, result, debug);
 }
 
 /**
@@ -760,7 +815,7 @@ extern "C" EI_IMPULSE_ERROR run_classifier(
  * @param debug Whether to show debug messages (default: false)
  */
 __attribute__((unused)) EI_IMPULSE_ERROR run_classifier(
-    const ei_impulse_t *impulse,
+    ei_impulse_handle_t *impulse,
     signal_t *signal,
     ei_impulse_result_t *result,
     bool debug = false)
@@ -794,7 +849,7 @@ __attribute__((unused)) EI_IMPULSE_ERROR run_impulse(
 #endif
         bool debug = false) {
 
-    const ei_impulse_t impulse = ei_default_impulse;
+    auto& impulse = *(ei_default_impulse.impulse);
 
     float *x = (float*)calloc(impulse.dsp_input_frame_size, sizeof(float));
     if (!x) {
diff --git a/edgeimpulse/edge-impulse-sdk/classifier/ei_run_dsp.h b/edgeimpulse/edge-impulse-sdk/classifier/ei_run_dsp.h
index d04c144..4f17585 100644
--- a/edgeimpulse/edge-impulse-sdk/classifier/ei_run_dsp.h
+++ b/edgeimpulse/edge-impulse-sdk/classifier/ei_run_dsp.h
@@ -22,6 +22,7 @@
 #include "edge-impulse-sdk/dsp/spectral/spectral.hpp"
 #include "edge-impulse-sdk/dsp/speechpy/speechpy.hpp"
 #include "edge-impulse-sdk/classifier/ei_signal_with_range.h"
+#include "edge-impulse-sdk/dsp/ei_flatten.h"
 #include "model-parameters/model_metadata.h"
 
 #if defined(__cplusplus) && EI_C_LINKAGE == 1
@@ -136,104 +137,10 @@ __attribute__((unused)) int extract_raw_features(signal_t *signal, matrix_t *out
 }
 
 __attribute__((unused)) int extract_flatten_features(signal_t *signal, matrix_t *output_matrix, void *config_ptr, const float frequency) {
-    ei_dsp_config_flatten_t config = *((ei_dsp_config_flatten_t*)config_ptr);
-
-    uint32_t expected_matrix_size = 0;
-    if (config.average) expected_matrix_size += config.axes;
-    if (config.minimum) expected_matrix_size += config.axes;
-    if (config.maximum) expected_matrix_size += config.axes;
-    if (config.rms) expected_matrix_size += config.axes;
-    if (config.stdev) expected_matrix_size += config.axes;
-    if (config.skewness) expected_matrix_size += config.axes;
-    if (config.kurtosis) expected_matrix_size += config.axes;
-
-    if (output_matrix->rows * output_matrix->cols != expected_matrix_size) {
-        EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
-    }
-
-    int ret;
-
-    // input matrix from the raw signal
-    matrix_t input_matrix(signal->total_length / config.axes, config.axes);
-    if (!input_matrix.buffer) {
-        EIDSP_ERR(EIDSP_OUT_OF_MEM);
-    }
-    signal->get_data(0, signal->total_length, input_matrix.buffer);
-
-    // scale the signal
-    ret = numpy::scale(&input_matrix, config.scale_axes);
-    if (ret != EIDSP_OK) {
-        ei_printf("ERR: Failed to scale signal (%d)\n", ret);
-        EIDSP_ERR(ret);
-    }
-
-    // transpose the matrix so we have one row per axis (nifty!)
-    ret = numpy::transpose(&input_matrix);
-    if (ret != EIDSP_OK) {
-        ei_printf("ERR: Failed to transpose matrix (%d)\n", ret);
-        EIDSP_ERR(ret);
-    }
-
-    size_t out_matrix_ix = 0;
-
-    for (size_t row = 0; row < input_matrix.rows; row++) {
-        matrix_t row_matrix(1, input_matrix.cols, input_matrix.buffer + (row * input_matrix.cols));
-
-        if (config.average) {
-            float fbuffer;
-            matrix_t out_matrix(1, 1, &fbuffer);
-            numpy::mean(&row_matrix, &out_matrix);
-            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
-        }
-
-        if (config.minimum) {
-            float fbuffer;
-            matrix_t out_matrix(1, 1, &fbuffer);
-            numpy::min(&row_matrix, &out_matrix);
-            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
-        }
-
-        if (config.maximum) {
-            float fbuffer;
-            matrix_t out_matrix(1, 1, &fbuffer);
-            numpy::max(&row_matrix, &out_matrix);
-            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
-        }
-
-        if (config.rms) {
-            float fbuffer;
-            matrix_t out_matrix(1, 1, &fbuffer);
-            numpy::rms(&row_matrix, &out_matrix);
-            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
-        }
-
-        if (config.stdev) {
-            float fbuffer;
-            matrix_t out_matrix(1, 1, &fbuffer);
-            numpy::stdev(&row_matrix, &out_matrix);
-            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
-        }
-
-        if (config.skewness) {
-            float fbuffer;
-            matrix_t out_matrix(1, 1, &fbuffer);
-            numpy::skew(&row_matrix, &out_matrix);
-            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
-        }
-
-        if (config.kurtosis) {
-            float fbuffer;
-            matrix_t out_matrix(1, 1, &fbuffer);
-            numpy::kurtosis(&row_matrix, &out_matrix);
-            output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
-        }
-    }
-
-    // flatten again
-    output_matrix->cols = output_matrix->rows * output_matrix->cols;
-    output_matrix->rows = 1;
-
-    return EIDSP_OK;
+    auto handle = flatten_class::create(config_ptr); // a fresh flatten_class is allocated for this single call
+    auto ret = handle->extract(signal, output_matrix, config_ptr, frequency);
+    delete handle; // NOTE(review): the handle's stored means are destroyed here, so no moving-average history carries over between calls through this function - confirm this is intended
+    return ret;
 }
 
 static class speechpy::processing::preemphasis *preemphasis;
@@ -303,7 +210,7 @@ __attribute__((unused)) int extract_mfcc_features(signal_t *signal, matrix_t *ou
 }
 
 
-static int extract_mfcc_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfcc_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out, int implementation_version) {
+__attribute__((unused)) static int extract_mfcc_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfcc_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out, int implementation_version) {
     uint32_t frequency = (uint32_t)sampling_frequency;
 
     int x;
@@ -571,7 +478,7 @@ __attribute__((unused)) int extract_spectrogram_features(signal_t *signal, matri
 }
 
 
-static int extract_spectrogram_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_spectrogram_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
+__attribute__((unused)) static int extract_spectrogram_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_spectrogram_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
     uint32_t frequency = (uint32_t)sampling_frequency;
 
     int x;
@@ -882,7 +789,7 @@ __attribute__((unused)) int extract_mfe_features(signal_t *signal, matrix_t *out
     return EIDSP_OK;
 }
 
-static int extract_mfe_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfe_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
+__attribute__((unused)) static int extract_mfe_run_slice(signal_t *signal, matrix_t *output_matrix, ei_dsp_config_mfe_t *config, const float sampling_frequency, matrix_size_t *matrix_size_out) {
     uint32_t frequency = (uint32_t)sampling_frequency;
 
     int x;
diff --git a/edgeimpulse/edge-impulse-sdk/dsp/ei_dsp_handle.h b/edgeimpulse/edge-impulse-sdk/dsp/ei_dsp_handle.h
new file mode 100644
index 0000000..462117a
--- /dev/null
+++ b/edgeimpulse/edge-impulse-sdk/dsp/ei_dsp_handle.h
@@ -0,0 +1,58 @@
+/* Edge Impulse inferencing library
+ * Copyright (c) 2023 EdgeImpulse Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __EI_DSP_HANDLE__H__
+#define __EI_DSP_HANDLE__H__
+
+#include "edge-impulse-sdk/dsp/config.hpp"
+#include "edge-impulse-sdk/dsp/numpy_types.h"
+
+class DspHandle { // Abstract interface for a DSP block: subclasses must implement print() and extract()
+public:
+    /**
+     * @brief Override and call ei_printf to print debug information, especially the current state
+     *
+     * @return int Status code chosen by the implementation
+     */
+    virtual int print() = 0;
+
+    /**
+     * @brief Override and convert raw data into processed features. Any state should live inside your custom class.
+     * Provide a constructor to initialize your state.
+     *
+     * @param signal Callback object to get raw data from
+     * @param output_matrix Output matrix to write features to
+     * @param config Configuration object, generated by Studio based on your DSP block parameters
+     * @param frequency Sampling frequency, as set in your project
+     * @return int 0 on success, anything else for failure
+     */
+    virtual int extract(ei::signal_t *signal, ei::matrix_t *output_matrix, void *config, const float frequency) = 0;
+
+    // Virtual so that deleting an object through a DspHandle* also runs the subclass destructor
+    /**
+     * @brief If you call new or ei_malloc anywhere in your class, you must override this function and delete your objects
+     * The base implementation is empty; it releases nothing on behalf of subclasses.
+     */
+    virtual ~DspHandle() {};
+};
+
+#endif  //!__EI_DSP_HANDLE__H__
\ No newline at end of file
diff --git a/edgeimpulse/edge-impulse-sdk/dsp/ei_flatten.h b/edgeimpulse/edge-impulse-sdk/dsp/ei_flatten.h
new file mode 100644
index 0000000..8802543
--- /dev/null
+++ b/edgeimpulse/edge-impulse-sdk/dsp/ei_flatten.h
@@ -0,0 +1,198 @@
+/* Edge Impulse inferencing library
+ * Copyright (c) 2023 EdgeImpulse Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __EI_FLATTEN__H__
+#define __EI_FLATTEN__H__
+
+#include "edge-impulse-sdk/dsp/ei_vector.h"
+#include "edge-impulse-sdk/dsp/returntypes.hpp"
+#include "edge-impulse-sdk/dsp/ei_dsp_handle.h"
+#include "model-parameters/model_metadata.h"
+#include "edge-impulse-sdk/dsp/numpy.hpp"
+#include "edge-impulse-sdk/dsp/config.hpp"
+
+class flatten_class : public DspHandle { // Flatten DSP block: per-axis summary statistics plus an optional moving average of the per-axis mean
+public:
+    int print() override { // Dump the stored per-axis means through ei_printf; always returns ei::EIDSP_OK
+        ei_printf("means: ");
+        for(int axis = 0; (size_t)axis < this->means.size(); axis++) {
+            ei_printf("axis: %i\n", axis);
+            for (size_t i = 0; i < this->means[axis].size(); i++) { // bounded by this axis's own window, not by the number of axes
+                ei_printf("%f ", this->means[axis][i]);
+            }
+        }
+        ei_printf("\n");
+        return ei::EIDSP_OK;
+    }
+    // Computes every enabled statistic for each axis of `signal` and stores the values in output_matrix, grouped by axis; `frequency` is not used.
+    int extract(ei::signal_t *signal, ei::matrix_t *output_matrix, void *config_ptr, const float frequency) override {
+        using namespace ei;
+
+        ei_dsp_config_flatten_t config = *((ei_dsp_config_flatten_t*)config_ptr);
+
+        uint32_t expected_matrix_size = 0; // every enabled statistic contributes one value per axis
+        if (config.average) expected_matrix_size += config.axes;
+        if (config.minimum) expected_matrix_size += config.axes;
+        if (config.maximum) expected_matrix_size += config.axes;
+        if (config.rms) expected_matrix_size += config.axes;
+        if (config.stdev) expected_matrix_size += config.axes;
+        if (config.skewness) expected_matrix_size += config.axes;
+        if (config.kurtosis) expected_matrix_size += config.axes;
+        if (config.moving_avg_num_windows) expected_matrix_size += config.axes;
+
+        if (output_matrix->rows * output_matrix->cols != expected_matrix_size) {
+            EIDSP_ERR(EIDSP_MATRIX_SIZE_MISMATCH);
+        }
+
+        int ret;
+
+        // input matrix from the raw signal
+        matrix_t input_matrix(signal->total_length / config.axes, config.axes);
+        if (!input_matrix.buffer) {
+            EIDSP_ERR(EIDSP_OUT_OF_MEM);
+        }
+        signal->get_data(0, signal->total_length, input_matrix.buffer);
+
+        // scale the signal
+        ret = numpy::scale(&input_matrix, config.scale_axes);
+        if (ret != EIDSP_OK) {
+            ei_printf("ERR: Failed to scale signal (%d)\n", ret);
+            EIDSP_ERR(ret);
+        }
+
+        // transpose the matrix so we have one row per axis
+        numpy::transpose_in_place(&input_matrix);
+
+        size_t out_matrix_ix = 0;
+
+        for (size_t row = 0; row < input_matrix.rows; row++) {
+            matrix_t row_matrix(1, input_matrix.cols, input_matrix.buffer + (row * input_matrix.cols));
+
+            float mean = 0.0f; // shared by the average output and the moving average; assigned below whenever either is enabled
+
+            if (config.average || config.moving_avg_num_windows) {
+                float fbuffer;
+                matrix_t out_matrix(1, 1, &fbuffer);
+                numpy::mean(&row_matrix, &out_matrix);
+                mean = out_matrix.buffer[0];
+                if (config.average) {
+                    output_matrix->buffer[out_matrix_ix++] = mean;
+                }
+            }
+
+            if (config.minimum) {
+                float fbuffer;
+                matrix_t out_matrix(1, 1, &fbuffer);
+                numpy::min(&row_matrix, &out_matrix);
+                output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
+            }
+
+            if (config.maximum) {
+                float fbuffer;
+                matrix_t out_matrix(1, 1, &fbuffer);
+                numpy::max(&row_matrix, &out_matrix);
+                output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
+            }
+
+            if (config.rms) {
+                float fbuffer;
+                matrix_t out_matrix(1, 1, &fbuffer);
+                numpy::rms(&row_matrix, &out_matrix);
+                output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
+            }
+
+            if (config.stdev) {
+                float fbuffer;
+                matrix_t out_matrix(1, 1, &fbuffer);
+                numpy::stdev(&row_matrix, &out_matrix);
+                output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
+            }
+
+            if (config.skewness) {
+                float fbuffer;
+                matrix_t out_matrix(1, 1, &fbuffer);
+                numpy::skew(&row_matrix, &out_matrix);
+                output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
+            }
+
+            if (config.kurtosis) {
+                float fbuffer;
+                matrix_t out_matrix(1, 1, &fbuffer);
+                numpy::kurtosis(&row_matrix, &out_matrix);
+                output_matrix->buffer[out_matrix_ix++] = out_matrix.buffer[0];
+            }
+
+            if (config.moving_avg_num_windows) {
+                push_mean(row, mean); // record this call's mean in the axis's window
+                output_matrix->buffer[out_matrix_ix++] = numpy::mean(means[row].data(), means[row].size()); // average of the means stored so far for this axis
+            }
+        }
+
+        // flatten again
+        output_matrix->cols = output_matrix->rows * output_matrix->cols;
+        output_matrix->rows = 1;
+
+        return EIDSP_OK;
+    }
+
+    static DspHandle* create(void* config); // builds a flatten_class from an ei_dsp_config_flatten_t*; defined after the class
+
+    void* operator new(size_t size) {
+        // Allocate instances of this class through ei_malloc
+        return ei_malloc(size);
+    }
+
+    void operator delete(void* ptr) {
+        // Release instances of this class through ei_free
+        ei_free(ptr);
+    }
+
+private:
+    ei_vector<ei_vector<float>> means; // means[axis]: the means pushed for that axis
+    ei_vector<size_t> head_indexes; // head_indexes[axis]: slot in means[axis] that the next mean is written to
+    size_t moving_avg_num_windows; // window length; a head index wraps to 0 on reaching it
+
+    flatten_class(int moving_avg_num_windows, int axes_count) : means(axes_count), head_indexes(axes_count, 0) {
+        this->moving_avg_num_windows = moving_avg_num_windows;
+    }
+
+    void push_mean(int axis, float mean) { // store `mean` at the axis's head slot, then advance the head
+        auto& head = head_indexes[axis];
+        if (head >= means[axis].size()) { // slot does not exist yet: grow the window
+            means[axis].push_back(mean);
+        } else { // slot exists: overwrite the value stored there
+            means[axis][head] = mean;
+        }
+        head = head + 1;
+        // This is a lot cheaper than mod (%)
+        if (head >= moving_avg_num_windows) {
+            head = 0;
+        }
+    }
+};
+
+inline DspHandle* flatten_class::create(void* config_in) { // NOLINT def in header is OK at EI; inline avoids duplicate symbols if this header reaches several translation units
+    auto config = reinterpret_cast<ei_dsp_config_flatten_t*>(config_in); // config_in is dereferenced below, so it must point to an ei_dsp_config_flatten_t
+    return new flatten_class(config->moving_avg_num_windows, config->axes); // allocated via flatten_class::operator new; the caller deletes the returned handle
+}
+
+#endif  //!__EI_FLATTEN__H__
\ No newline at end of file
diff --git a/edgeimpulse/edge-impulse-sdk/dsp/numpy_types.h b/edgeimpulse/edge-impulse-sdk/dsp/numpy_types.h
index 4fda745..9d02538 100644
--- a/edgeimpulse/edge-impulse-sdk/dsp/numpy_types.h
+++ b/edgeimpulse/edge-impulse-sdk/dsp/numpy_types.h
@@ -30,9 +30,7 @@
 #endif // __MBED__
 #endif // __cplusplus
 #include "config.hpp"
-
-#include "../porting/ei_classifier_porting.h"
-
+#include "edge-impulse-sdk/dsp/returntypes.h"
 
 #if EIDSP_TRACK_ALLOCATIONS
 #include "memory.hpp"
diff --git a/edgeimpulse/edge-impulse-sdk/dsp/returntypes.h b/edgeimpulse/edge-impulse-sdk/dsp/returntypes.h
new file mode 100644
index 0000000..7f8960f
--- /dev/null
+++ b/edgeimpulse/edge-impulse-sdk/dsp/returntypes.h
@@ -0,0 +1,35 @@
+#ifndef _EIDSP_RETURN_TYPES_H_
+#define _EIDSP_RETURN_TYPES_H_
+
+#include <stdint.h>
+
+// outside of namespace for backwards compat
+typedef enum {
+    EI_IMPULSE_OK = 0, // the only non-negative value; every other enumerator below is negative
+    EI_IMPULSE_ERROR_SHAPES_DONT_MATCH = -1,
+    EI_IMPULSE_CANCELED = -2,
+    EI_IMPULSE_TFLITE_ERROR = -3, // note: -4 is not assigned to any enumerator
+    EI_IMPULSE_DSP_ERROR = -5,
+    EI_IMPULSE_TFLITE_ARENA_ALLOC_FAILED = -6,
+    EI_IMPULSE_CUBEAI_ERROR = -7,
+    EI_IMPULSE_ALLOC_FAILED = -8,
+    EI_IMPULSE_ONLY_SUPPORTED_FOR_IMAGES = -9,
+    EI_IMPULSE_UNSUPPORTED_INFERENCING_ENGINE = -10,
+    EI_IMPULSE_OUT_OF_MEMORY = -11, // note: -12 is not assigned to any enumerator
+    EI_IMPULSE_INPUT_TENSOR_WAS_NULL = -13,
+    EI_IMPULSE_OUTPUT_TENSOR_WAS_NULL = -14,
+    EI_IMPULSE_SCORE_TENSOR_WAS_NULL = -15,
+    EI_IMPULSE_LABEL_TENSOR_WAS_NULL = -16,
+    EI_IMPULSE_TENSORRT_INIT_FAILED = -17,
+    EI_IMPULSE_DRPAI_INIT_FAILED = -18,
+    EI_IMPULSE_DRPAI_RUNTIME_FAILED = -19,
+    EI_IMPULSE_DEPRECATED_MODEL = -20,
+    EI_IMPULSE_LAST_LAYER_NOT_AVAILABLE = -21,
+    EI_IMPULSE_INFERENCE_ERROR = -22,
+    EI_IMPULSE_AKIDA_ERROR = -23,
+    EI_IMPULSE_INVALID_SIZE = -24,
+    EI_IMPULSE_ONNX_ERROR = -25,
+    EI_IMPULSE_MEMRYX_ERROR = -26,
+} EI_IMPULSE_ERROR;
+
+#endif // _EIDSP_RETURN_TYPES_H_
\ No newline at end of file
diff --git a/edgeimpulse/edge-impulse-sdk/dsp/returntypes.hpp b/edgeimpulse/edge-impulse-sdk/dsp/returntypes.hpp
index 01cdbf6..f8eba0a 100644
--- a/edgeimpulse/edge-impulse-sdk/dsp/returntypes.hpp
+++ b/edgeimpulse/edge-impulse-sdk/dsp/returntypes.hpp
@@ -15,10 +15,11 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#ifndef _EIDSP_RETURN_TYPES_H_
-#define _EIDSP_RETURN_TYPES_H_
+#ifndef _EIDSP_RETURN_TYPES_HPP_
+#define _EIDSP_RETURN_TYPES_HPP_
 
 #include <stdint.h>
+#include "returntypes.h"
 
 namespace ei {
 
diff --git a/edgeimpulse/edge-impulse-sdk/dsp/speechpy/processing.hpp b/edgeimpulse/edge-impulse-sdk/dsp/speechpy/processing.hpp
index b7e87e5..5b34b1b 100644
--- a/edgeimpulse/edge-impulse-sdk/dsp/speechpy/processing.hpp
+++ b/edgeimpulse/edge-impulse-sdk/dsp/speechpy/processing.hpp
@@ -203,7 +203,7 @@ namespace processing {
      * @param frame_stride (float): The stride between frames.
      * @returns Number of frames required, or a negative number if an error occured
      */
-    static int calculate_signal_used(
+    __attribute__((unused)) static int calculate_signal_used(
         size_t signal_size,
         uint32_t sampling_frequency,
         float frame_length,
diff --git a/edgeimpulse/edge-impulse-sdk/porting/ei_classifier_porting.h b/edgeimpulse/edge-impulse-sdk/porting/ei_classifier_porting.h
index cb82adf..58ae129 100644
--- a/edgeimpulse/edge-impulse-sdk/porting/ei_classifier_porting.h
+++ b/edgeimpulse/edge-impulse-sdk/porting/ei_classifier_porting.h
@@ -20,39 +20,12 @@
 
 #include <stdint.h>
 #include <stdlib.h>
+#include "edge-impulse-sdk/dsp/returntypes.h"
 
 #if defined(__cplusplus) && EI_C_LINKAGE == 1
 extern "C" {
 #endif // defined(__cplusplus)
 
-typedef enum {
-    EI_IMPULSE_OK = 0,
-    EI_IMPULSE_ERROR_SHAPES_DONT_MATCH = -1,
-    EI_IMPULSE_CANCELED = -2,
-    EI_IMPULSE_TFLITE_ERROR = -3,
-    EI_IMPULSE_DSP_ERROR = -5,
-    EI_IMPULSE_TFLITE_ARENA_ALLOC_FAILED = -6,
-    EI_IMPULSE_CUBEAI_ERROR = -7,
-    EI_IMPULSE_ALLOC_FAILED = -8,
-    EI_IMPULSE_ONLY_SUPPORTED_FOR_IMAGES = -9,
-    EI_IMPULSE_UNSUPPORTED_INFERENCING_ENGINE = -10,
-    EI_IMPULSE_OUT_OF_MEMORY = -11,
-    EI_IMPULSE_INPUT_TENSOR_WAS_NULL = -13,
-    EI_IMPULSE_OUTPUT_TENSOR_WAS_NULL = -14,
-    EI_IMPULSE_SCORE_TENSOR_WAS_NULL = -15,
-    EI_IMPULSE_LABEL_TENSOR_WAS_NULL = -16,
-    EI_IMPULSE_TENSORRT_INIT_FAILED = -17,
-    EI_IMPULSE_DRPAI_INIT_FAILED = -18,
-    EI_IMPULSE_DRPAI_RUNTIME_FAILED = -19,
-    EI_IMPULSE_DEPRECATED_MODEL = -20,
-    EI_IMPULSE_LAST_LAYER_NOT_AVAILABLE = -21,
-    EI_IMPULSE_INFERENCE_ERROR = -22,
-    EI_IMPULSE_AKIDA_ERROR = -23,
-    EI_IMPULSE_INVALID_SIZE = -24,
-    EI_IMPULSE_ONNX_ERROR = -25,
-    EI_IMPULSE_MEMRYX_ERROR = -26,
-} EI_IMPULSE_ERROR;
-
 /**
  * Cancelable sleep, can be triggered with signal from other thread
  */
diff --git a/edgeimpulse/edge-impulse-sdk/porting/ei_logging.h b/edgeimpulse/edge-impulse-sdk/porting/ei_logging.h
index 37926c0..b69604b 100644
--- a/edgeimpulse/edge-impulse-sdk/porting/ei_logging.h
+++ b/edgeimpulse/edge-impulse-sdk/porting/ei_logging.h
@@ -39,13 +39,9 @@
 #define EI_LOGD(format, ...) (void)0
 
 #ifndef EI_LOG_LEVEL
-    #define EI_LOG_LEVEL EI_LOG_LEVEL_NONE
+    #define EI_LOG_LEVEL EI_LOG_LEVEL_INFO
 #endif
 
-#if defined(__cplusplus) && EI_C_LINKAGE == 1
-extern "C"
-#endif // defined(__cplusplus) && EI_C_LINKAGE == 1
-
 __attribute__((unused)) static const char *debug_msgs[] =
 {
     "NONE", // this one will never show
diff --git a/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/kernels/softmax.cpp b/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/kernels/softmax.cpp
index 97be376..75fd8aa 100644
--- a/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/kernels/softmax.cpp
+++ b/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/kernels/softmax.cpp
@@ -298,7 +298,7 @@ void SoftmaxQuantized(TfLiteContext* context, const TfLiteEvalTensor* input,
 #if EI_TFLITE_DISABLE_SOFTMAX_IN_I8
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                       TfLiteTypeGetName(input->type), input->type);
-      return kTfLiteError;
+      return;
 #endif
     if (output->type == kTfLiteInt16) {
 #if EI_TFLITE_DISABLE_SOFTMAX_OUT_I16
diff --git a/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/micro_allocator.cpp b/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/micro_allocator.cpp
index e293283..8b4e575 100644
--- a/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/micro_allocator.cpp
+++ b/edgeimpulse/edge-impulse-sdk/tensorflow/lite/micro/micro_allocator.cpp
@@ -208,7 +208,7 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
 
   result->data.data = GetFlatbufferTensorBuffer(flatbuffer_tensor, buffers);
   // this is useful for debugging
-#ifdef EI_LOG_LEVEL && EI_LOG_LEVEL >= 4
+#if EI_LOG_LEVEL && EI_LOG_LEVEL >= 4
   result->name = flatbuffer_tensor.name()->c_str();
 #endif
   // TODO(petewarden): Some of these paths aren't getting enough testing