From 86608c6770cf08c138a2bdab5855072f64be09ef Mon Sep 17 00:00:00 2001
From: joshua
Date: Sat, 30 Dec 2023 23:54:31 -0500
Subject: initial commit

---
 .../NN/Source/PoolingFunctions/arm_pool_q7_HWC.c | 460 +++++++++++++++++++++
 1 file changed, 460 insertions(+)
 create mode 100644 Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c

diff --git a/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
new file mode 100644
index 0000000..71fb771
--- /dev/null
+++ b/Drivers/CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_pool_q7_HWC.c
+ * Description:  Pooling function implementations
+ *
+ * $Date:        17. January 2018
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+#if defined (ARM_MATH_DSP)
+
+/**
+ * @brief A few utility functions used by pooling functions
+ *
+ *
+ */
+
+static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
+{
+    int i;
+
+    for (i = 0; i < length; i++)
+    {
+        target[i] = (q7_t) (buffer[i] / scale);
+    }
+}
+
+static void compare_and_replace_if_larger_q7(q7_t * base,           // base data
+                                             const q7_t * target,   // compare target
+                                             const uint16_t length  // data length
+    )
+{
+    q7_t *pIn = base;
+    const q7_t *pCom = target;
+    union arm_nnword in;
+    union arm_nnword com;
+    uint16_t cnt = length >> 2;
+
+    while (cnt > 0u)
+    {
+        in.word = *__SIMD32(pIn);
+        com.word = *__SIMD32(pCom)++;
+
+        // if version
+        if (com.bytes[0] > in.bytes[0])
+            in.bytes[0] = com.bytes[0];
+        if (com.bytes[1] > in.bytes[1])
+            in.bytes[1] = com.bytes[1];
+        if (com.bytes[2] > in.bytes[2])
+            in.bytes[2] = com.bytes[2];
+        if (com.bytes[3] > in.bytes[3])
+            in.bytes[3] = com.bytes[3];
+
+        *__SIMD32(pIn)++ = in.word;
+
+        cnt--;
+    }
+
+    cnt = length & 0x3;
+    while (cnt > 0u)
+    {
+        if (*pCom > *pIn)
+        {
+            *pIn = *pCom;
+        }
+        pIn++;
+        pCom++;
+        cnt--;
+    }
+}
+
+static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
+{
+    q15_t *pCnt = base;
+    q7_t *pV = target;
+    q31_t v1, v2, vo1, vo2;
+    uint16_t cnt = length >> 2;
+    q31_t in;
+
+    while (cnt > 0u)
+    {
+        q31_t value = *__SIMD32(pV)++;
+        v1 = __SXTB16(__ROR(value, 8));
+        v2 = __SXTB16(value);
+#ifndef ARM_MATH_BIG_ENDIAN
+
+        vo2 = __PKHTB(v1, v2, 16);
+        vo1 = __PKHBT(v2, v1, 16);
+
+#else
+
+        vo1 = __PKHTB(v1, v2, 16);
+        vo2 = __PKHBT(v2, v1, 16);
+
+#endif
+
+        in = *__SIMD32(pCnt);
+        *__SIMD32(pCnt)++ = __QADD16(vo1, in);
+
+        in = *__SIMD32(pCnt);
+        *__SIMD32(pCnt)++ = __QADD16(vo2, in);
+
+        cnt--;
+    }
+    cnt = length & 0x3;
+    while (cnt > 0u)
+    {
+        *pCnt++ += *pV++;
+        cnt--;
+    }
+}
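+
+/* Illustrative sketch (not part of the original file, kept out of the
+ * build): a plain-C restatement of what accumulate_q7_to_q15 above
+ * computes. Each q7 value is sign-extended to q15 and added into the
+ * accumulator; the packed SIMD path additionally saturates via __QADD16,
+ * which this sketch omits. The guard macro and function name are
+ * assumptions chosen for this example, not part of CMSIS. */
+#ifdef POOL_SCALAR_REFERENCE
+static void accumulate_q7_to_q15_scalar(q15_t * base, const q7_t * target, const uint16_t length)
+{
+    uint16_t i;
+
+    for (i = 0; i < length; i++)
+    {
+        base[i] += (q15_t) target[i];   /* widen, then accumulate */
+    }
+}
+#endif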
+
+#endif // ARM_MATH_DSP
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup Pooling
+ * @{
+ */
+
+/**
+ * @brief Q7 max pooling function
+ * @param[in,out]   Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @return none.
+ *
+ * @details
+ *
+ * Buffer size:
+ *
+ * bufferA size: 0
+ *
+ * The pooling function is implemented as split x-pooling then
+ * y-pooling.
+ *
+ * This pooling function is input-destructive. Input data is undefined
+ * after calling this function.
+ *
+ */
+
+void
+arm_maxpool_q7_HWC(q7_t * Im_in,
+                   const uint16_t dim_im_in,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel,
+                   const uint16_t padding,
+                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t i_x, i_y;
+
+    /* first, pool along the x axis */
+    for (i_y = 0; i_y < dim_im_in; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out; i_x++)
+        {
+            /* for each output pixel */
+            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
+            q7_t *win_start;
+            q7_t *win_stop;
+            if (i_x * stride - padding < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
+            }
+
+            if (i_x * stride - padding + dim_kernel >= dim_im_in)
+            {
+                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            /* arm_copy_q7(win_start, target, ch_im_in); */
+            memmove(target, win_start, ch_im_in);
+
+            /* start the max operation from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
+            }
+        }
+    }
+
+    /* then, pool along the y axis */
+    for (i_y = 0; i_y < dim_im_out; i_y++)
+    {
+
+        /* for each output row */
+        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
+        q7_t *row_start;
+        q7_t *row_end;
+        /* setting the starting row */
+        if (i_y * stride - padding < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
+        }
+        /* setting the stopping row */
+        if (i_y * stride - padding + dim_kernel >= dim_im_in)
+        {
+            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
+        }
+
+        /* copy over the first row */
+        /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
+        memmove(target, row_start, dim_im_out * ch_im_in);
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in;
+
+        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
+        {
+            compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
+        }
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t i_ch_in, i_x, i_y;
+    int16_t k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out; i_x++)
+            {
+                int max = -129;
+                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
+                {
+                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
+                        {
+                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
+                            {
+                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
+                            }
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
+            }
+        }
+    }
+
+#endif /* ARM_MATH_DSP */
+
+}
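+
+/* Usage sketch (illustrative; the dimensions and the guard macro are
+ * assumptions for this example, not part of CMSIS). Max pooling needs no
+ * scratch space (bufferA size 0, so NULL is passed) and clobbers Im_in.
+ * Here a 28x28x8 HWC tensor is pooled with a 2x2 kernel, stride 2, no
+ * padding, giving a 14x14x8 output. */
+#ifdef POOL_USAGE_EXAMPLE
+static void maxpool_usage_example(void)
+{
+    static q7_t img[28 * 28 * 8];   /* input tensor, HWC layout; clobbered */
+    static q7_t out[14 * 14 * 8];   /* output tensor, HWC layout */
+
+    arm_maxpool_q7_HWC(img, 28, 8, 2, 0, 2, 14, NULL, out);
+}
+#endif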
+
+/**
+ * @brief Q7 average pooling function
+ * @param[in,out]   Im_in       pointer to input tensor
+ * @param[in]       dim_im_in   input tensor dimension
+ * @param[in]       ch_im_in    number of input tensor channels
+ * @param[in]       dim_kernel  filter kernel size
+ * @param[in]       padding     padding sizes
+ * @param[in]       stride      convolution stride
+ * @param[in]       dim_im_out  output tensor dimension
+ * @param[in,out]   bufferA     pointer to buffer space for input
+ * @param[in,out]   Im_out      pointer to output tensor
+ * @return none.
+ *
+ * @details
+ *
+ * Buffer size:
+ *
+ * bufferA size: 2*dim_im_out*ch_im_in
+ *
+ * The pooling function is implemented as split x-pooling then
+ * y-pooling.
+ *
+ * This pooling function is input-destructive. Input data is undefined
+ * after calling this function.
+ *
+ */
+
+void
+arm_avepool_q7_HWC(q7_t * Im_in,
+                   const uint16_t dim_im_in,
+                   const uint16_t ch_im_in,
+                   const uint16_t dim_kernel,
+                   const uint16_t padding,
+                   const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    q15_t *buffer = (q15_t *) bufferA;
+    int16_t i_x, i_y;
+    int16_t count = 0;
+
+    /* first, pool along the x axis */
+    for (i_y = 0; i_y < dim_im_in; i_y++)
+    {
+
+        for (i_x = 0; i_x < dim_im_out; i_x++)
+        {
+            /* for each output pixel */
+            q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
+            q7_t *win_start;
+            q7_t *win_stop;
+            if (i_x * stride - padding < 0)
+            {
+                win_start = target;
+            } else
+            {
+                win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
+            }
+
+            if (i_x * stride - padding + dim_kernel >= dim_im_in)
+            {
+                win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
+            } else
+            {
+                win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
+            }
+
+            /* first step is to copy over initial data */
+            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
+            count = 1;
+
+            /* start accumulating from the second part */
+            win_start += ch_im_in;
+            for (; win_start < win_stop; win_start += ch_im_in)
+            {
+                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
+                count++;
+            }
+            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
+        }
+    }
+
+    /* then, pool along the y axis */
+    for (i_y = 0; i_y < dim_im_out; i_y++)
+    {
+        /* for each output row */
+        q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
+        q7_t *row_start;
+        q7_t *row_end;
+        /* setting the starting row */
+        if (i_y * stride - padding < 0)
+        {
+            row_start = Im_in;
+        } else
+        {
+            row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
+        }
+        /* setting the stopping row */
+        if (i_y * stride - padding + dim_kernel >= dim_im_in)
+        {
+            row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
+        } else
+        {
+            row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
+        }
+
+        /* copy over the first row */
+        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
+        count = 1;
+
+        /* move over to next row */
+        row_start += ch_im_in * dim_im_in;
+
+        for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
+        {
+            accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
+            count++;
+        }
+        buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
+    }
+
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    int16_t i_ch_in, i_x, i_y;
+    int16_t k_x, k_y;
+
+    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
+    {
+        for (i_y = 0; i_y < dim_im_out; i_y++)
+        {
+            for (i_x = 0; i_x < dim_im_out; i_x++)
+            {
+                int sum = 0;
+                int count = 0;
+                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
+                {
+                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
+                    {
+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
+                        {
+                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
+                            count++;
+                        }
+                    }
+                }
+                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
+            }
+        }
+    }
+
+#endif /* ARM_MATH_DSP */
+
+}
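+
+/* Usage sketch (illustrative; the dimensions and the guard macro are
+ * assumptions for this example, not part of CMSIS). Average pooling needs
+ * bufferA of 2*dim_im_out*ch_im_in bytes, used as dim_im_out*ch_im_in q15
+ * accumulators per the @details above; a q15 array is declared here so
+ * the scratch space is correctly aligned. Im_in is clobbered. */
+#ifdef POOL_USAGE_EXAMPLE
+static void avepool_usage_example(void)
+{
+    static q7_t img[28 * 28 * 8];   /* input tensor, HWC layout; clobbered */
+    static q7_t out[14 * 14 * 8];   /* output tensor, HWC layout */
+    static q15_t scratch[14 * 8];   /* 2 * dim_im_out * ch_im_in bytes */
+
+    arm_avepool_q7_HWC(img, 28, 8, 2, 0, 2, 14, (q7_t *) scratch, out);
+}
+#endif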
+
+/**
+ * @} end of Pooling group
+ */
--
cgit v1.2.3