|
/*******************************************************************************
* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to use this Software with Cadence processor cores only and
* not with any other processors and platforms, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

******************************************************************************/
#include "xa_nnlib_common.h"

#include <string.h>

/*
 * Currently supports only up to 5D input tensors.
 * 1/2/3/4 D input tensors are promoted to 5D.
 * For example, 2x3 -> 1x1x1x2x3.
 */

WORD32 xa_nn_transpose_8_8(WORD8 * __restrict__ p_out
                          ,const WORD32 *const p_out_shape
                          ,const WORD8 * __restrict__ p_inp
                          ,const WORD32 *const p_inp_shape
                          ,const WORD32 * __restrict__ p_permute_vec
                          ,WORD32 num_out_dims
                          ,WORD32 num_inp_dims)
{
  /* NULL pointer checks */
  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
  XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
  XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);

  /* Invalid input checks */
  XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
  XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);

  int itr = 0;
  for(itr=0; itr < num_inp_dims; itr++)
  {
    XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
  }
  for(itr=0; itr < num_out_dims; itr++)
  {
    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
  }

  /* Output shape provided must be correct based on input
   * shape and permute values */
  for(itr=0; itr < num_out_dims; itr++)
  {
    int output_dim = p_out_shape[itr];
    int expected_dim = p_inp_shape[p_permute_vec[itr]];
    XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1);
  }

  /* Pointer alignment checks */
  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1);
  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), -1);
  XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
  XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
  /* Shift all dims of size 1 toward the outer (leftmost) positions */
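  /* e.g. an effective output shape of {1,3,1,2} becomes {1,1,3,2}; the
   * corresponding permute entries are swapped along with their dims */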
  int eff_output_shape[5];
  int eff_permute_vec[5];

  for(int i = 0; i < num_out_dims; i++)
  {
    eff_output_shape[i] = p_out_shape[i];
    eff_permute_vec[i] = p_permute_vec[i];
  }

  int one_i = num_out_dims - 1, non_one_i = num_out_dims - 1;
  while(one_i > 0 && non_one_i >= 0){
    while(one_i > 0 && eff_output_shape[one_i] != 1){
      one_i--;
    }
    non_one_i = one_i;
    while(non_one_i >= 0 && eff_output_shape[non_one_i] == 1)
    {
      non_one_i--;
    }
    if(one_i > 0 && non_one_i >= 0){
      int temp;
      /* swap output_shape */
      {
        temp = eff_output_shape[one_i];
        eff_output_shape[one_i] = eff_output_shape[non_one_i];
        eff_output_shape[non_one_i] = temp;
      }
      /* swap permute_vec */
      {
        temp = eff_permute_vec[one_i];
        eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
        eff_permute_vec[non_one_i] = temp;
      }
    }
  }

  /* Promote lower-dimensional tensors to 5D tensors.
   * Also update the permute_vec and shapes as needed for optimization */
  int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
  int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
  int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};

  /* Count how many of the innermost input dimensions stay in place in the output */
  int last_dim_same = 1, last_n_same_dim = 0;
  itr = num_inp_dims - 1;
  while(itr >= 0)
  {
    last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
    last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
    itr--;
  }
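  /* e.g. for eff_permute_vec = {1, 0, 2, 3} the two innermost dims are
   * unpermuted, so last_n_same_dim = 2; a non-zero count enables the
   * memcpy fast path further below */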

  int dims_added = 5 - num_inp_dims;
  itr = num_inp_dims - 1;
  int same_count = last_n_same_dim;
  int count = 4;
  while(itr >= 0)
  {
    p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
    p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
    same_count--;
    itr--;
    count = (same_count > 0) ? count : count - 1;
  }

  itr = num_inp_dims - 1;
  same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
  count = 4;
  while(itr >= 0)
  {
    p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
    same_count--;
    itr--;
    count--;
  }
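  /* e.g. inp shape {2,3,4} with permute {1,0,2} maps to
   * p_5D_inp_shape {1,1,2,3,4}, p_5D_permute_vec {0,1,3,2,4},
   * p_5D_out_shape {1,1,3,2,4} */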

  int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
  int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
  int inp_stride[5];

  out_dim0 = p_5D_out_shape[0];
  out_dim1 = p_5D_out_shape[1];
  out_dim2 = p_5D_out_shape[2];
  out_dim3 = p_5D_out_shape[3];
  out_dim4 = p_5D_out_shape[4];

  inp_dim1 = p_5D_inp_shape[1];
  inp_dim2 = p_5D_inp_shape[2];
  inp_dim3 = p_5D_inp_shape[3];
  inp_dim4 = p_5D_inp_shape[4];

  inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
  inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
  inp_stride[2] = inp_dim3*inp_dim4;
  inp_stride[3] = inp_dim4;
  inp_stride[4] = 1;
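  /* Row-major strides of the 5D input, e.g. for p_5D_inp_shape {1,1,2,3,4}
   * inp_stride = {24, 24, 12, 4, 1} */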

  if(last_n_same_dim)
  {
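    /* Fast path: the innermost (collapsed) dimension is contiguous in both
     * input and output, so each inner run of out_dim4 bytes is copied with
     * a single memcpy */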
    int itr0, itr1, itr2, itr3;
    WORD8 *p_inp0 = (WORD8*)p_inp;
    for(itr0 = 0; itr0 < out_dim0; itr0++)
    {
      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
#pragma loop_count min=1
      for(itr1 = 0; itr1 < out_dim1; itr1++)
      {
        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
#pragma loop_count min=1
        for(itr2 = 0; itr2 < out_dim2; itr2++)
        {
          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
#pragma loop_count min=1
          for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
          {
            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
            memcpy(p_out, p_inp4, out_dim4);
          }
        }
      }
    }
  }
  else
  {
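    /* General case: elements are gathered one at a time through the
     * permuted input strides and written out contiguously */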
    int itr0, itr1, itr2, itr3, itr4;
    WORD8 *p_inp0 = (WORD8*)p_inp;
    for(itr0 = 0; itr0 < out_dim0; itr0++)
    {
      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
      for(itr1 = 0; itr1 < out_dim1; itr1++)
      {
        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
        for(itr2 = 0; itr2 < out_dim2; itr2++)
        {
          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
          for(itr3 = 0; itr3 < out_dim3; itr3++)
          {
            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
            for(itr4 = 0; itr4 < out_dim4; itr4++)
            {
              WORD8 d0 = *(p_inp4);
              p_inp4 += inp_stride[p_5D_permute_vec[4]];
              *p_out++ = d0;
            }
          }
        }
      }
    }
  }

  return 0;
}
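
/* Illustrative usage sketch (not part of the library build): transpose a
 * 2x3 int8 matrix. The buffer contents and the example function name are
 * hypothetical; only the xa_nn_transpose_8_8 signature above is assumed. */
#if 0
static void example_transpose_2x3(void)
{
  WORD8 inp[6] = {1, 2, 3, 4, 5, 6};      /* 2x3, row-major */
  WORD8 out[6];
  WORD32 inp_shape[2] = {2, 3};
  WORD32 out_shape[2] = {3, 2};
  WORD32 permute_vec[2] = {1, 0};         /* swap the two axes */

  WORD32 ret = xa_nn_transpose_8_8(out, out_shape,
                                   inp, inp_shape,
                                   permute_vec,
                                   2 /* num_out_dims */,
                                   2 /* num_inp_dims */);
  /* On success ret == 0 and out holds {1, 4, 2, 5, 3, 6} */
  (void)ret;
}
#endif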