initial commit (HEAD, master)

author: joshua <joshua@joshuayun.com> 2023-12-30 23:54:31 -0500
committer: joshua <joshua@joshuayun.com> 2023-12-30 23:54:31 -0500
commit: 86608c6770cf08c138a2bdab5855072f64be09ef (patch)
tree: 494a61b3ef37e76f9235a0d10f5c93d97290a35f /Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src
download: sdr-software-86608c6770cf08c138a2bdab5855072f64be09ef.tar.gz
67 files changed, 8229 insertions, 0 deletions
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/BasicMathFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/BasicMathFunctions.c
new file mode 100644
index 0000000..2bb5d8b
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/BasicMathFunctions.c
@@ -0,0 +1,11 @@
+
+#include "abs.c"
+#include "add.c"
+#include "dot_prod.c"
+#include "mult.c"
+#include "negate.c"
+#include "offset.c"
+#include "scale.c"
+#include "shift.c"
+#include "sub.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/abs.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/abs.c
new file mode 100644
index 0000000..baca23f
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/abs.c
@@ -0,0 +1,53 @@
+#include "ref.h"
+
+/*
+ * Reference element-wise absolute value for float32 vectors.
+ *
+ * For each of the blockSize samples, a strictly negative input is negated
+ * and any other input is copied unchanged to pDst.
+ */
+void ref_abs_f32(
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		float32_t sample = *pSrc++;
+
+		if (sample < 0)
+		{
+			sample = -sample;
+		}
+		*pDst++ = sample;
+	}
+}
+
+/*
+ * Reference element-wise absolute value for q31 vectors.
+ *
+ * The magnitude is formed in 64 bits and narrowed with ref_sat_q31(), so the
+ * most negative q31 input is no longer negated in 32 bits (signed overflow,
+ * undefined behaviour). With a clamping ref_sat_q31() that input yields the
+ * largest positive q31 value, as documented for arm_abs_q31.
+ *
+ * pSrc      input vector, blockSize samples
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_abs_q31(
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+
+	for(i=0;i<blockSize;i++)
+	{
+		q63_t wide = pSrc[i];
+
+		pDst[i] = ref_sat_q31( wide < 0 ? -wide : wide );
+	}
+}
+
+/*
+ * Reference element-wise absolute value for q15 vectors.
+ *
+ * The magnitude is formed in a q31_t and narrowed with ref_sat_q15(). The
+ * previous code stored -(-32768) = 32768 straight into a q15_t, which
+ * truncates instead of saturating as documented for arm_abs_q15.
+ *
+ * pSrc      input vector, blockSize samples
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_abs_q15(
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+
+	for(i=0;i<blockSize;i++)
+	{
+		q31_t wide = pSrc[i];
+
+		pDst[i] = ref_sat_q15( wide < 0 ? -wide : wide );
+	}
+}
+
+/*
+ * Reference element-wise absolute value for q7 vectors.
+ *
+ * The magnitude is formed in a q15_t and narrowed with ref_sat_q7(). The
+ * previous code stored -(-128) = 128 straight into a q7_t, which truncates
+ * instead of saturating as documented for arm_abs_q7.
+ *
+ * pSrc      input vector, blockSize samples
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_abs_q7(
+  q7_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+
+	for(i=0;i<blockSize;i++)
+	{
+		q15_t wide = pSrc[i];
+
+		/* At most 128 here, which still fits in q15_t. */
+		pDst[i] = ref_sat_q7( (q15_t)(wide < 0 ? -wide : wide) );
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/add.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/add.c
new file mode 100644
index 0000000..489c8a0
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/add.c
@@ -0,0 +1,57 @@
+#include "ref.h"
+
+/*
+ * Reference element-wise addition of two float32 vectors:
+ * pDst[k] = pSrcA[k] + pSrcB[k] for k in [0, blockSize).
+ */
+void ref_add_f32(
+  float32_t * pSrcA,
+  float32_t * pSrcB,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		*pDst++ = *pSrcA++ + *pSrcB++;
+	}
+}
+
+/*
+ * Reference element-wise addition of two q31 vectors.
+ * Each sum is formed in 64 bits and narrowed with ref_sat_q31().
+ */
+void ref_add_q31(
+  q31_t * pSrcA,
+  q31_t * pSrcB,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q31_t a = *pSrcA++;
+		q31_t b = *pSrcB++;
+
+		*pDst++ = ref_sat_q31( (q63_t)a + b );
+	}
+}
+
+/*
+ * Reference element-wise addition of two q15 vectors.
+ * Each sum is formed in 32 bits and narrowed with ref_sat_q15().
+ */
+void ref_add_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q15_t a = *pSrcA++;
+		q15_t b = *pSrcB++;
+
+		*pDst++ = ref_sat_q15( (q31_t)a + b );
+	}
+}
+
+/*
+ * Reference element-wise addition of two q7 vectors.
+ * Each sum is formed at wider precision and narrowed with ref_sat_q7().
+ */
+void ref_add_q7(
+  q7_t * pSrcA,
+  q7_t * pSrcB,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q7_t a = *pSrcA++;
+		q7_t b = *pSrcB++;
+
+		*pDst++ = ref_sat_q7( (q15_t)a + b );
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/dot_prod.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/dot_prod.c
new file mode 100644
index 0000000..08f6178
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/dot_prod.c
@@ -0,0 +1,65 @@
+#include "ref.h"
+
+/*
+ * Reference dot product of two float32 vectors.
+ * Accumulates pSrcA[k] * pSrcB[k] over blockSize samples, in order, and
+ * stores the total in *result (0.0f when blockSize is 0).
+ */
+void ref_dot_prod_f32(
+  float32_t * pSrcA,
+  float32_t * pSrcB,
+  uint32_t blockSize,
+  float32_t * result)
+{
+	float32_t acc = 0.0f;
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		float32_t a = *pSrcA++;
+		float32_t b = *pSrcB++;
+
+		acc += a * b;
+	}
+
+	*result = acc;
+}
+
+/*
+ * Reference dot product of two q31 vectors.
+ * Each 64-bit product is shifted right by 14 before accumulation; the
+ * original source annotates the resulting sum format as 16.48.
+ */
+void ref_dot_prod_q31(
+  q31_t * pSrcA,
+  q31_t * pSrcB,
+  uint32_t blockSize,
+  q63_t * result)
+{
+	q63_t acc = 0;
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q31_t a = *pSrcA++;
+		q31_t b = *pSrcB++;
+
+		acc += ((q63_t)a * b) >> 14;
+	}
+
+	*result = acc;
+}
+
+/*
+ * Reference dot product of two q15 vectors.
+ * Products are formed in 32 bits and accumulated in 64 bits; the original
+ * source annotates the resulting sum format as 34.30.
+ */
+void ref_dot_prod_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  uint32_t blockSize,
+  q63_t * result)
+{
+	q63_t acc = 0;
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q15_t a = *pSrcA++;
+		q15_t b = *pSrcB++;
+
+		acc += (q31_t)a * b;
+	}
+
+	*result = acc;
+}
+
+/*
+ * Reference dot product of two q7 vectors.
+ * Products are accumulated in a 32-bit sum; the original source annotates
+ * the resulting format as 18.14.
+ */
+void ref_dot_prod_q7(
+  q7_t * pSrcA,
+  q7_t * pSrcB,
+  uint32_t blockSize,
+  q31_t * result)
+{
+	q31_t acc = 0;
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q7_t a = *pSrcA++;
+		q7_t b = *pSrcB++;
+
+		acc += (q31_t)a * b;
+	}
+
+	*result = acc;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/mult.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/mult.c
new file mode 100644
index 0000000..a77c870
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/mult.c
@@ -0,0 +1,64 @@
+#include "ref.h"
+
+/*
+ * Reference element-wise multiplication of two float32 vectors:
+ * pDst[k] = pSrcA[k] * pSrcB[k] for k in [0, blockSize).
+ */
+void ref_mult_f32(
+  float32_t * pSrcA,
+  float32_t * pSrcB,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		*pDst++ = *pSrcA++ * *pSrcB++;
+	}
+}
+
+/*
+ * Reference element-wise multiplication of two q31 vectors.
+ *
+ * The 64-bit product keeps its upper 32 bits (>> 32), is doubled, and is
+ * narrowed with ref_sat_q31(). The doubling is written as a multiplication
+ * because the intermediate may be negative and left-shifting a negative
+ * value is undefined behaviour in C; the numeric result is unchanged.
+ *
+ * pSrcA, pSrcB input vectors, blockSize samples each
+ * pDst         output vector, blockSize samples
+ * blockSize    number of samples to process
+ */
+void ref_mult_q31(
+  q31_t * pSrcA,
+  q31_t * pSrcB,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+	q63_t temp;
+
+	for(i=0;i<blockSize;i++)
+	{
+		temp = ((q63_t)pSrcA[i] * pSrcB[i]) >> 32;
+		temp = temp * 2;
+		pDst[i] = ref_sat_q31(temp);
+	}
+}
+
+/*
+ * Reference element-wise multiplication of two q15 vectors.
+ *
+ * The 32-bit product is shifted right by 15 and narrowed with ref_sat_q15().
+ * Per the original author's note, the shift is deliberately 15 (not 16, as
+ * the q31 variant might suggest) because the CMSIS DSP library does it that
+ * way.
+ */
+void ref_mult_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q15_t a = *pSrcA++;
+		q15_t b = *pSrcB++;
+		q31_t product = ((q31_t)a * b) >> 15;
+
+		*pDst++ = ref_sat_q15(product);
+	}
+}
+
+/*
+ * Reference element-wise multiplication of two q7 vectors.
+ * The product is shifted right by 7, held in a q15_t and narrowed with
+ * ref_sat_q7().
+ */
+void ref_mult_q7(
+  q7_t * pSrcA,
+  q7_t * pSrcB,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q7_t a = *pSrcA++;
+		q7_t b = *pSrcB++;
+		q15_t product = ((q15_t)a * b) >> 7;
+
+		*pDst++ = ref_sat_q7(product);
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/negate.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/negate.c
new file mode 100644
index 0000000..192da1b
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/negate.c
@@ -0,0 +1,53 @@
+#include "ref.h"
+
+/*
+ * Reference element-wise negation of a float32 vector:
+ * pDst[k] = -pSrc[k] for k in [0, blockSize).
+ */
+void ref_negate_f32(
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		*pDst++ = -(*pSrc++);
+	}
+}
+
+/*
+ * Reference element-wise negation of a q31 vector.
+ *
+ * The negation is done in 64 bits and narrowed with ref_sat_q31(), so the
+ * most negative q31 input is no longer negated in 32 bits (signed overflow,
+ * undefined behaviour). With a clamping ref_sat_q31() that input yields the
+ * largest positive q31 value, as documented for arm_negate_q31.
+ *
+ * pSrc      input vector, blockSize samples
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_negate_q31(
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+
+	for(i=0;i<blockSize;i++)
+	{
+		pDst[i] = ref_sat_q31( -(q63_t)pSrc[i] );
+	}
+}
+
+/*
+ * Reference element-wise negation of a q15 vector.
+ *
+ * The negation is done in 32 bits and narrowed with ref_sat_q15(). The
+ * previous code stored -(-32768) = 32768 straight into a q15_t, which
+ * truncates instead of saturating as documented for arm_negate_q15.
+ *
+ * pSrc      input vector, blockSize samples
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_negate_q15(
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+
+	for(i=0;i<blockSize;i++)
+	{
+		pDst[i] = ref_sat_q15( -(q31_t)pSrc[i] );
+	}
+}
+
+/*
+ * Reference element-wise negation of a q7 vector.
+ *
+ * The negation is done at wider precision and narrowed with ref_sat_q7().
+ * The previous code stored -(-128) = 128 straight into a q7_t, which
+ * truncates instead of saturating as documented for arm_negate_q7.
+ *
+ * pSrc      input vector, blockSize samples
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_negate_q7(
+  q7_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+
+	for(i=0;i<blockSize;i++)
+	{
+		q15_t wide = pSrc[i];
+
+		/* At most 128 here, which still fits in q15_t. */
+		pDst[i] = ref_sat_q7( (q15_t)-wide );
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/offset.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/offset.c
new file mode 100644
index 0000000..b076e75
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/offset.c
@@ -0,0 +1,57 @@
+#include "ref.h"
+
+/*
+ * Reference constant offset for a float32 vector:
+ * pDst[k] = pSrc[k] + offset for k in [0, blockSize).
+ */
+void ref_offset_f32(
+  float32_t * pSrc,
+  float32_t offset,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		*pDst++ = *pSrc++ + offset;
+	}
+}
+
+/*
+ * Reference constant offset for a q31 vector.
+ * Each sum is formed in 64 bits and narrowed with ref_sat_q31().
+ */
+void ref_offset_q31(
+  q31_t * pSrc,
+  q31_t offset,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q31_t sample = *pSrc++;
+
+		*pDst++ = ref_sat_q31( (q63_t)sample + offset );
+	}
+}
+
+/*
+ * Reference constant offset for a q15 vector.
+ * Each sum is formed in 32 bits and narrowed with ref_sat_q15().
+ */
+void ref_offset_q15(
+  q15_t * pSrc,
+  q15_t offset,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q15_t sample = *pSrc++;
+
+		*pDst++ = ref_sat_q15( (q31_t)sample + offset );
+	}
+}
+
+/*
+ * Reference constant offset for a q7 vector.
+ * Each sum is formed at wider precision and narrowed with ref_sat_q7().
+ */
+void ref_offset_q7(
+  q7_t * pSrc,
+  q7_t offset,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q7_t sample = *pSrc++;
+
+		*pDst++ = ref_sat_q7( (q15_t)sample + offset );
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/scale.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/scale.c
new file mode 100644
index 0000000..5ab655c
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/scale.c
@@ -0,0 +1,69 @@
+#include "ref.h"
+
+/*
+ * Reference scaling of a float32 vector:
+ * pDst[k] = pSrc[k] * scale for k in [0, blockSize).
+ */
+void ref_scale_f32(
+  float32_t * pSrc,
+  float32_t scale,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		*pDst++ = *pSrc++ * scale;
+	}
+}
+
+/*
+ * Reference scaling of a q31 vector by a fractional factor and a shift.
+ *
+ * Each sample is multiplied by scaleFract in 64 bits and the upper 32 bits
+ * are kept. The result is then shifted by (shift + 1): a negative amount is
+ * a plain right shift, a non-negative amount is a left shift in 64 bits
+ * narrowed with ref_sat_q31().
+ */
+void ref_scale_q31(
+  q31_t * pSrc,
+  q31_t scaleFract,
+  int8_t shift,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	int8_t kShift = shift + 1;                     /* Shift to apply after scaling */
+	uint32_t idx;
+
+	for (idx = 0U; idx < blockSize; idx++)
+	{
+		q63_t scaled = ((q63_t) pSrc[idx] * scaleFract) >> 32;
+
+		if (kShift < 0)
+		{
+			pDst[idx] = scaled >> -kShift;
+		}
+		else
+		{
+			pDst[idx] = ref_sat_q31( (q63_t)scaled << kShift );
+		}
+	}
+}
+
+/*
+ * Reference scaling of a q15 vector by a fractional factor and a shift.
+ * Each 32-bit product is shifted right by (15 - shift) and narrowed with
+ * ref_sat_q15().
+ */
+void ref_scale_q15(
+  q15_t * pSrc,
+  q15_t scaleFract,
+  int8_t shift,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	int8_t kShift = 15 - shift;                      /* Shift to apply after scaling */
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q15_t sample = *pSrc++;
+
+		*pDst++ = ref_sat_q15(((q31_t) sample * scaleFract) >> kShift);
+	}
+}
+
+/*
+ * Reference scaling of a q7 vector by a fractional factor and a shift.
+ * Each product is shifted right by (7 - shift) and narrowed with
+ * ref_sat_q7().
+ */
+void ref_scale_q7(
+  q7_t * pSrc,
+  q7_t scaleFract,
+  int8_t shift,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	int8_t kShift = 7 - shift;                      /* Shift to apply after scaling */
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q7_t sample = *pSrc++;
+
+		*pDst++ = ref_sat_q7(((q15_t) sample * scaleFract) >> kShift);
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/shift.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/shift.c
new file mode 100644
index 0000000..3bc53ad
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/shift.c
@@ -0,0 +1,73 @@
+#include "ref.h"
+
+/*
+ * Reference bit shift of a q31 vector.
+ *
+ * shiftBits >= 0 shifts each sample left by shiftBits and narrows the result
+ * with ref_sat_q31(); shiftBits < 0 shifts each sample right by -shiftBits.
+ * This is the sign convention documented for arm_shift_q31. The previous
+ * version had the two branches swapped, so every non-zero shiftBits produced
+ * a negative shift count, which is undefined behaviour.
+ *
+ * The shift count is clamped to 31. For a 32-bit sample a larger count gives
+ * the same saturated (left) or sign-filled (right) result, and the clamp
+ * keeps the count inside the range C allows.
+ *
+ * pSrc      input vector, blockSize samples
+ * shiftBits signed shift amount: positive = left, negative = right
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_shift_q31(
+  q31_t * pSrc,
+  int8_t shiftBits,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+	int count;
+
+	if (shiftBits >= 0)
+	{
+		count = (shiftBits > 31) ? 31 : shiftBits;
+
+		for(i=0;i<blockSize;i++)
+		{
+			/* Multiply by 2^count: left-shifting a negative value is undefined. */
+			pDst[i] = ref_sat_q31( (q63_t)pSrc[i] * ((q63_t)1 << count) );
+		}
+	}
+	else
+	{
+		count = (shiftBits < -31) ? 31 : -shiftBits;
+
+		for(i=0;i<blockSize;i++)
+		{
+			pDst[i] = pSrc[i] >> count;
+		}
+	}
+}
+
+/*
+ * Reference bit shift of a q15 vector.
+ *
+ * shiftBits >= 0 shifts each sample left by shiftBits and narrows the result
+ * with ref_sat_q15(); shiftBits < 0 shifts each sample right by -shiftBits.
+ * This is the sign convention documented for arm_shift_q15. The previous
+ * version had the two branches swapped, so every non-zero shiftBits produced
+ * a negative shift count, which is undefined behaviour.
+ *
+ * The shift count is clamped to 15. For a 16-bit sample a larger count gives
+ * the same saturated (left) or sign-filled (right) result, and the clamp
+ * keeps the left-shifted value inside a q31_t.
+ *
+ * pSrc      input vector, blockSize samples
+ * shiftBits signed shift amount: positive = left, negative = right
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_shift_q15(
+  q15_t * pSrc,
+  int8_t shiftBits,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+	int count;
+
+	if (shiftBits >= 0)
+	{
+		count = (shiftBits > 15) ? 15 : shiftBits;
+
+		for(i=0;i<blockSize;i++)
+		{
+			/* Multiply by 2^count: left-shifting a negative value is undefined. */
+			pDst[i] = ref_sat_q15( (q31_t)pSrc[i] * ((q31_t)1 << count) );
+		}
+	}
+	else
+	{
+		count = (shiftBits < -15) ? 15 : -shiftBits;
+
+		for(i=0;i<blockSize;i++)
+		{
+			pDst[i] = pSrc[i] >> count;
+		}
+	}
+}
+
+/*
+ * Reference bit shift of a q7 vector.
+ *
+ * shiftBits >= 0 shifts each sample left by shiftBits and narrows the result
+ * with ref_sat_q7(); shiftBits < 0 shifts each sample right by -shiftBits.
+ * This is the sign convention documented for arm_shift_q7. The previous
+ * version had the two branches swapped, so every non-zero shiftBits produced
+ * a negative shift count, which is undefined behaviour.
+ *
+ * The shift count is clamped to 7. For an 8-bit sample a larger count gives
+ * the same saturated (left) or sign-filled (right) result, and the clamp
+ * keeps the left-shifted value inside a q15_t.
+ *
+ * pSrc      input vector, blockSize samples
+ * shiftBits signed shift amount: positive = left, negative = right
+ * pDst      output vector, blockSize samples
+ * blockSize number of samples to process
+ */
+void ref_shift_q7(
+  q7_t * pSrc,
+  int8_t shiftBits,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+	int count;
+
+	if (shiftBits >= 0)
+	{
+		count = (shiftBits > 7) ? 7 : shiftBits;
+
+		for(i=0;i<blockSize;i++)
+		{
+			/* Multiply by 2^count: left-shifting a negative value is undefined. */
+			pDst[i] = ref_sat_q7( (q15_t)(pSrc[i] * (1 << count)) );
+		}
+	}
+	else
+	{
+		count = (shiftBits < -7) ? 7 : -shiftBits;
+
+		for(i=0;i<blockSize;i++)
+		{
+			pDst[i] = pSrc[i] >> count;
+		}
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/sub.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/sub.c
new file mode 100644
index 0000000..da89e95
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/BasicMathFunctions/sub.c
@@ -0,0 +1,57 @@
+#include "ref.h"
+
+/*
+ * Reference element-wise subtraction of two float32 vectors:
+ * pDst[k] = pSrcA[k] - pSrcB[k] for k in [0, blockSize).
+ */
+void ref_sub_f32(
+  float32_t * pSrcA,
+  float32_t * pSrcB,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		*pDst++ = *pSrcA++ - *pSrcB++;
+	}
+}
+
+/*
+ * Reference element-wise subtraction of two q31 vectors.
+ * Each difference is formed in 64 bits and narrowed with ref_sat_q31().
+ */
+void ref_sub_q31(
+  q31_t * pSrcA,
+  q31_t * pSrcB,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q31_t a = *pSrcA++;
+		q31_t b = *pSrcB++;
+
+		*pDst++ = ref_sat_q31( (q63_t)a - b );
+	}
+}
+
+/*
+ * Reference element-wise subtraction of two q15 vectors.
+ * Each difference is formed in 32 bits and narrowed with ref_sat_q15().
+ */
+void ref_sub_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q15_t a = *pSrcA++;
+		q15_t b = *pSrcB++;
+
+		*pDst++ = ref_sat_q15( (q31_t)a - b );
+	}
+}
+
+/*
+ * Reference element-wise subtraction of two q7 vectors.
+ * Each difference is formed at wider precision and narrowed with
+ * ref_sat_q7().
+ */
+void ref_sub_q7(
+  q7_t * pSrcA,
+  q7_t * pSrcB,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t remaining;
+
+	for (remaining = blockSize; remaining > 0U; remaining--)
+	{
+		q7_t a = *pSrcA++;
+		q7_t b = *pSrcB++;
+
+		*pDst++ = ref_sat_q7( (q15_t)a - b );
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/ComplexMathFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/ComplexMathFunctions.c
new file mode 100644
index 0000000..d28aad8
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/ComplexMathFunctions.c
@@ -0,0 +1,8 @@
+
+#include "cmplx_conj.c"
+#include "cmplx_dot_prod.c"
+#include "cmplx_mag.c"
+#include "cmplx_mag_squared.c"
+#include "cmplx_mult_cmplx.c"
+#include "cmplx_mult_real.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_conj.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_conj.c
new file mode 100644
index 0000000..6b80de0
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_conj.c
@@ -0,0 +1,40 @@
+#include "ref.h"
+
+/*
+ * Reference complex conjugate for interleaved float32 data.
+ * The buffers hold numSamples (real, imag) pairs; the real part is copied
+ * and the imaginary part is negated.
+ */
+void ref_cmplx_conj_f32(
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k = 0U;
+
+	while (k < end)
+	{
+		pDst[k]      = pSrc[k];
+		pDst[k + 1U] = -pSrc[k + 1U];
+		k += 2U;
+	}
+}
+
+/*
+ * Reference complex conjugate for interleaved q31 data.
+ *
+ * The buffers hold numSamples (real, imag) pairs; the real part is copied
+ * and the imaginary part is negated. The negation is done in 64 bits and
+ * narrowed with ref_sat_q31(), so the most negative q31 imaginary part is no
+ * longer negated in 32 bits (signed overflow, undefined behaviour); the
+ * CMSIS documentation for arm_cmplx_conj_q31 specifies saturation to the
+ * largest positive value in that case.
+ *
+ * pSrc       input buffer, 2 * numSamples values
+ * pDst       output buffer, 2 * numSamples values
+ * numSamples number of complex samples
+ */
+void ref_cmplx_conj_q31(
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t numSamples)
+{
+	uint32_t i;
+	for(i=0;i<numSamples*2;i+=2)
+	{
+		pDst[i] = pSrc[i];
+		pDst[i+1] = ref_sat_q31( -(q63_t)pSrc[i+1] );
+	}
+}
+
+/*
+ * Reference complex conjugate for interleaved q15 data.
+ *
+ * The buffers hold numSamples (real, imag) pairs; the real part is copied
+ * and the imaginary part is negated. The negation is done in 32 bits and
+ * narrowed with ref_sat_q15(). The previous code stored -(-32768) = 32768
+ * straight into a q15_t, which truncates instead of saturating as the CMSIS
+ * documentation for arm_cmplx_conj_q15 specifies.
+ *
+ * pSrc       input buffer, 2 * numSamples values
+ * pDst       output buffer, 2 * numSamples values
+ * numSamples number of complex samples
+ */
+void ref_cmplx_conj_q15(
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t numSamples)
+{
+	uint32_t i;
+	for(i=0;i<numSamples*2;i+=2)
+	{
+		pDst[i] = pSrc[i];
+		pDst[i+1] = ref_sat_q15( -(q31_t)pSrc[i+1] );
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_dot_prod.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_dot_prod.c
new file mode 100644
index 0000000..19b1e1c
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_dot_prod.c
@@ -0,0 +1,72 @@
+#include "ref.h"
+
+/*
+ * Reference complex dot product for interleaved float32 data.
+ *
+ * Accumulates A[n] * B[n] over numSamples complex pairs (no conjugation is
+ * applied) and stores the real and imaginary totals in *realResult and
+ * *imagResult.
+ */
+void ref_cmplx_dot_prod_f32(
+  float32_t * pSrcA,
+  float32_t * pSrcB,
+  uint32_t numSamples,
+  float32_t * realResult,
+  float32_t * imagResult)
+{
+	const uint32_t end = numSamples * 2U;
+	float32_t accRe = 0;
+	float32_t accIm = 0;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		float32_t aRe = pSrcA[k];
+		float32_t aIm = pSrcA[k + 1U];
+		float32_t bRe = pSrcB[k];
+		float32_t bIm = pSrcB[k + 1U];
+
+		accRe += aRe * bRe - aIm * bIm;
+		accIm += aRe * bIm + aIm * bRe;
+	}
+
+	*realResult = accRe;
+	*imagResult = accIm;
+}
+
+/*
+ * Reference complex dot product for interleaved q31 data.
+ *
+ * Every 64-bit partial product is shifted right by 14 before it is combined;
+ * the original source describes this as providing guard bits. Totals are
+ * stored in *realResult and *imagResult.
+ */
+void ref_cmplx_dot_prod_q31(
+  q31_t * pSrcA,
+  q31_t * pSrcB,
+  uint32_t numSamples,
+  q63_t * realResult,
+  q63_t * imagResult)
+{
+	const uint32_t end = numSamples * 2U;
+	q63_t accRe = 0;
+	q63_t accIm = 0;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		q31_t aRe = pSrcA[k];
+		q31_t aIm = pSrcA[k + 1U];
+		q31_t bRe = pSrcB[k];
+		q31_t bIm = pSrcB[k + 1U];
+
+		accRe += (((q63_t)aRe * bRe) >> 14) - (((q63_t)aIm * bIm) >> 14);
+		accIm += (((q63_t)aRe * bIm) >> 14) + (((q63_t)aIm * bRe) >> 14);
+	}
+
+	*realResult = accRe;
+	*imagResult = accIm;
+}
+
+/*
+ * Reference complex dot product for interleaved q15 data.
+ *
+ * Partial products are widened to 64 bits before they are combined. The
+ * previous code added two 32-bit products in 32-bit arithmetic, which can
+ * overflow (for example 2^30 + 2^30 when all four operands are -32768)
+ * before the value reaches the 64-bit accumulator.
+ *
+ * The 64-bit totals are shifted right by 6 and truncated to q31_t on store.
+ * The original source notes that shifting once at the end is more accurate
+ * because the accumulator already has 32 guard bits.
+ *
+ * pSrcA, pSrcB input buffers, 2 * numSamples values each
+ * numSamples   number of complex samples
+ * realResult   receives the real part of the sum
+ * imagResult   receives the imaginary part of the sum
+ */
+void ref_cmplx_dot_prod_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  uint32_t numSamples,
+  q31_t * realResult,
+  q31_t * imagResult)
+{
+	q63_t sumr, sumi;
+	uint32_t i;
+
+	sumr = 0;
+	sumi = 0;
+
+	for(i=0;i<numSamples*2;i+=2)
+	{
+		sumr += (q63_t)pSrcA[i] * pSrcB[i]   - (q63_t)pSrcA[i+1] * pSrcB[i+1];
+		sumi += (q63_t)pSrcA[i] * pSrcB[i+1] + (q63_t)pSrcA[i+1] * pSrcB[i];
+	}
+
+	*realResult = (q31_t)(sumr >> 6);
+	*imagResult = (q31_t)(sumi >> 6);
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mag.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mag.c
new file mode 100644
index 0000000..b5ac28d
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mag.c
@@ -0,0 +1,49 @@
+#include "ref.h"
+
+/*
+ * Reference complex magnitude for interleaved float32 data.
+ * For each of numSamples (real, imag) pairs, writes
+ * sqrtf(real^2 + imag^2) to the next element of pDst.
+ */
+void ref_cmplx_mag_f32(
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		float32_t re = pSrc[k];
+		float32_t im = pSrc[k + 1U];
+
+		pDst[k >> 1] = sqrtf(re * re + im * im);
+	}
+}
+
+/*
+ * Reference complex magnitude for interleaved q31 data.
+ *
+ * Each squared component is formed in 64 bits and shifted right by 33. The
+ * two are summed, and the square root is taken in single-precision floating
+ * point after scaling the sum by 1 / 2^31; the result is scaled back by 2^31
+ * and truncated to q31_t.
+ */
+void ref_cmplx_mag_q31(
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		q31_t reSq = (q31_t)(((q63_t)pSrc[k] * pSrc[k]) >> 33);
+		q31_t imSq = (q31_t)(((q63_t)pSrc[k + 1U] * pSrc[k + 1U]) >> 33);
+		q31_t power = reSq + imSq;
+
+		pDst[k >> 1] = (q31_t)(sqrtf((float)power / 2147483648.0f) * 2147483648.0f);
+	}
+}
+
+/*
+ * Reference complex magnitude for interleaved q15 data.
+ *
+ * The squared components are summed in 64 bits, shifted right by 17 and
+ * truncated to q15_t. The square root is taken in single-precision floating
+ * point after scaling by 1 / 2^15; the result is scaled back by 2^15 and
+ * truncated to q15_t.
+ */
+void ref_cmplx_mag_q15(
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		q31_t reSq = pSrc[k] * pSrc[k];
+		q31_t imSq = pSrc[k + 1U] * pSrc[k + 1U];
+		q15_t power = (q15_t) (((q63_t) reSq + imSq) >> 17);
+
+		pDst[k >> 1] = (q15_t)(sqrtf((float)power / 32768.0f) * 32768.0f);
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mag_squared.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mag_squared.c
new file mode 100644
index 0000000..aec7bd5
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mag_squared.c
@@ -0,0 +1,46 @@
+#include "ref.h"
+
+/*
+ * Reference squared complex magnitude for interleaved float32 data.
+ * For each of numSamples (real, imag) pairs, writes real^2 + imag^2 to the
+ * next element of pDst.
+ */
+void ref_cmplx_mag_squared_f32(
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		float32_t re = pSrc[k];
+		float32_t im = pSrc[k + 1U];
+
+		pDst[k >> 1] = re * re + im * im;
+	}
+}
+
+/*
+ * Reference squared complex magnitude for interleaved q31 data.
+ * Each squared component is formed in 64 bits and shifted right by 33; the
+ * two results are added and stored.
+ */
+void ref_cmplx_mag_squared_q31(
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		q31_t reSq = (q31_t)(((q63_t)pSrc[k] * pSrc[k]) >> 33);
+		q31_t imSq = (q31_t)(((q63_t)pSrc[k + 1U] * pSrc[k + 1U]) >> 33);
+
+		pDst[k >> 1] = reSq + imSq;
+	}
+}
+
+/*
+ * Reference squared complex magnitude for interleaved q15 data.
+ * The squared components are summed in 64 bits, shifted right by 17 and
+ * truncated to q15_t.
+ */
+void ref_cmplx_mag_squared_q15(
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		q31_t reSq = pSrc[k] * pSrc[k];
+		q31_t imSq = pSrc[k + 1U] * pSrc[k + 1U];
+
+		pDst[k >> 1] = (q15_t) (((q63_t) reSq + imSq) >> 17);
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mult_cmplx.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mult_cmplx.c
new file mode 100644
index 0000000..c7a5409
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mult_cmplx.c
@@ -0,0 +1,56 @@
+#include "ref.h"
+
+/*
+ * Reference complex-by-complex multiplication for interleaved float32 data.
+ * For each of numSamples pairs:
+ *   real = aRe * bRe - aIm * bIm
+ *   imag = aRe * bIm + aIm * bRe
+ * The real part is stored before the imaginary part is computed.
+ */
+void ref_cmplx_mult_cmplx_f32(
+  float32_t * pSrcA,
+  float32_t * pSrcB,
+  float32_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k = 0U;
+
+	while (k < end)
+	{
+		float32_t *a = &pSrcA[k];
+		float32_t *b = &pSrcB[k];
+		float32_t *d = &pDst[k];
+
+		d[0] = a[0] * b[0] - a[1] * b[1];
+		d[1] = a[0] * b[1] + a[1] * b[0];
+		k += 2U;
+	}
+}
+
+/*
+ * Reference complex-by-complex multiplication for interleaved q31 data.
+ * The four partial products are formed in 64 bits and shifted right by 33
+ * before being combined; all four are computed before either output value
+ * is written.
+ */
+void ref_cmplx_mult_cmplx_q31(
+  q31_t * pSrcA,
+  q31_t * pSrcB,
+  q31_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		q31_t reRe = ((q63_t)pSrcA[k]      * pSrcB[k])      >> 33;
+		q31_t imIm = ((q63_t)pSrcA[k + 1U] * pSrcB[k + 1U]) >> 33;
+		q31_t reIm = ((q63_t)pSrcA[k]      * pSrcB[k + 1U]) >> 33;
+		q31_t imRe = ((q63_t)pSrcA[k + 1U] * pSrcB[k])      >> 33;
+
+		pDst[k]      = reRe - imIm;
+		pDst[k + 1U] = reIm + imRe;
+	}
+}
+
+/*
+ * Reference complex-by-complex multiplication for interleaved q15 data.
+ * The four partial products are formed in 32 bits and shifted right by 17
+ * before being combined; all four are computed before either output value
+ * is written.
+ */
+void ref_cmplx_mult_cmplx_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  q15_t * pDst,
+  uint32_t numSamples)
+{
+	const uint32_t end = numSamples * 2U;
+	uint32_t k;
+
+	for (k = 0U; k < end; k += 2U)
+	{
+		q31_t reRe = ((q31_t)pSrcA[k]      * pSrcB[k])      >> 17;
+		q31_t imIm = ((q31_t)pSrcA[k + 1U] * pSrcB[k + 1U]) >> 17;
+		q31_t reIm = ((q31_t)pSrcA[k]      * pSrcB[k + 1U]) >> 17;
+		q31_t imRe = ((q31_t)pSrcA[k + 1U] * pSrcB[k])      >> 17;
+
+		pDst[k]      = reRe - imIm;
+		pDst[k + 1U] = reIm + imRe;
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mult_real.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mult_real.c
new file mode 100644
index 0000000..dc4928e
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ComplexMathFunctions/cmplx_mult_real.c
@@ -0,0 +1,52 @@
+#include "ref.h"
+
+/*
+ * Reference complex-by-real multiplication for float32 data.
+ * Both parts of the n-th interleaved complex sample are multiplied by the
+ * n-th real sample; the output is interleaved complex.
+ */
+void ref_cmplx_mult_real_f32(
+  float32_t * pSrcCmplx,
+  float32_t * pSrcReal,
+  float32_t * pCmplxDst,
+  uint32_t numSamples)
+{
+	uint32_t remaining;
+
+	for (remaining = numSamples; remaining > 0U; remaining--)
+	{
+		/* The real factor is read once per output value, as in the indexed form. */
+		*pCmplxDst++ = *pSrcCmplx++ * *pSrcReal;
+		*pCmplxDst++ = *pSrcCmplx++ * *pSrcReal++;
+	}
+}
+
+/*
+ * Reference complex-by-real multiplication for q31 data.
+ * Each product keeps the upper 32 bits of the 64-bit result, is passed
+ * through ref_sat_n(value, 31) and is then shifted left by one.
+ */
+void ref_cmplx_mult_real_q31(
+  q31_t * pSrcCmplx,
+  q31_t * pSrcReal,
+  q31_t * pCmplxDst,
+  uint32_t numSamples)
+{
+	uint32_t n;
+
+	for (n = 0U; n < numSamples; n++)
+	{
+		const uint32_t re = 2U * n;
+		const uint32_t im = re + 1U;
+		q31_t prodRe = ((q63_t) pSrcCmplx[re] * pSrcReal[n]) >> 32;
+		q31_t prodIm = ((q63_t) pSrcCmplx[im] * pSrcReal[n]) >> 32;
+
+		pCmplxDst[re] = ref_sat_n(prodRe, 31) << 1;
+		pCmplxDst[im] = ref_sat_n(prodIm, 31) << 1;
+	}
+}
+
+/*
+ * Reference complex-by-real multiplication for q15 data.
+ * Each 32-bit product is shifted right by 15 and narrowed with
+ * ref_sat_q15().
+ */
+void ref_cmplx_mult_real_q15(
+  q15_t * pSrcCmplx,
+  q15_t * pSrcReal,
+  q15_t * pCmplxDst,
+  uint32_t numSamples)
+{
+	uint32_t n;
+
+	for (n = 0U; n < numSamples; n++)
+	{
+		const uint32_t re = 2U * n;
+		const uint32_t im = re + 1U;
+		q31_t prodRe = ((q31_t) pSrcCmplx[re] * pSrcReal[n]) >> 15;
+		q31_t prodIm = ((q31_t) pSrcCmplx[im] * pSrcReal[n]) >> 15;
+
+		pCmplxDst[re] = ref_sat_q15(prodRe);
+		pCmplxDst[im] = ref_sat_q15(prodIm);
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/ControllerFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/ControllerFunctions.c
new file mode 100644
index 0000000..b26e912
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/ControllerFunctions.c
@@ -0,0 +1,4 @@
+
+#include "pid.c"
+#include "sin_cos.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/pid.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/pid.c
new file mode 100644
index 0000000..51aa633
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/pid.c
@@ -0,0 +1,97 @@
+#include "ref.h"
+
+/*
+ * Reference float32 PID step.
+ *
+ * Computes y[n] = y[n-1] + A0 * x[n] + A1 * x[n-1] + A2 * x[n-2], where
+ * S->state[0] and S->state[1] hold the two previous inputs and S->state[2]
+ * holds the previous output. The state is advanced and y[n] is returned.
+ */
+float32_t ref_pid_f32(
+	arm_pid_instance_f32 * S,
+	float32_t in)
+{
+	float32_t *hist = S->state;
+	float32_t y;
+
+	y = hist[2] + S->A0 * in + S->A1 * hist[0] + S->A2 * hist[1];
+
+	/* Shift the input history and remember the new output. */
+	hist[1] = hist[0];
+	hist[0] = in;
+	hist[2] = y;
+
+	return y;
+}
+
+/*
+ * Reference q31 PID step.
+ *
+ * A0 * x[n] + A1 * x[n-1] + A2 * x[n-2] is accumulated in 64 bits, shifted
+ * right by 31 (the original source calls this conversion to 1.31 format) and
+ * added to the previous output S->state[2] in 32-bit arithmetic. The state
+ * is advanced and the new output is returned.
+ */
+q31_t ref_pid_q31(
+	arm_pid_instance_q31 * S,
+	q31_t in)
+{
+	q63_t sum;
+	q31_t y;
+
+	sum  = (q63_t) S->A0 * in;
+	sum += (q63_t) S->A1 * S->state[0];
+	sum += (q63_t) S->A2 * S->state[1];
+
+	y  = (q31_t) (sum >> 31U);
+	y += S->state[2];
+
+	/* Shift the input history and remember the new output. */
+	S->state[1] = S->state[0];
+	S->state[0] = in;
+	S->state[2] = y;
+
+	return y;
+}
+
+/*
+ * Reference q15 PID step.
+ *
+ * When ARM_MATH_DSP is defined, the A1 and A2 gains are unpacked from the
+ * two 16-bit halves of S->A1; which half holds which gain depends on
+ * ARM_MATH_BIG_ENDIAN. Otherwise they are read from S->A1 and S->A2.
+ *
+ * The three gain * input products and the previous output (shifted left by
+ * 15) are accumulated in 64 bits; the sum is shifted right by 15 and
+ * narrowed with ref_sat_q15(). The state is advanced and the new output is
+ * returned.
+ */
+q15_t ref_pid_q15(
+	arm_pid_instance_q15 * S,
+	q15_t in)
+{
+	q15_t gainA1, gainA2;
+	q15_t y;
+	q63_t sum;
+
+#if defined (ARM_MATH_DSP)
+
+#ifndef  ARM_MATH_BIG_ENDIAN
+	gainA2 = S->A1 >> 16;
+	gainA1 = (q15_t)S->A1;
+#else
+	gainA1 = S->A1 >> 16;
+	gainA2 = (q15_t)S->A1;
+#endif
+
+#else
+
+	gainA1 = S->A1;
+	gainA2 = S->A2;
+
+#endif
+
+	sum  = ((q31_t) S->A0) * in;
+	sum += (q31_t) gainA1 * S->state[0];
+	sum += (q31_t) gainA2 * S->state[1];
+	sum += (q31_t) S->state[2] << 15;
+
+	y = ref_sat_q15(sum >> 15);
+
+	/* Shift the input history and remember the new output. */
+	S->state[1] = S->state[0];
+	S->state[0] = in;
+	S->state[2] = y;
+
+	return y;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/sin_cos.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/sin_cos.c
new file mode 100644
index 0000000..22c91a0
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/ControllerFunctions/sin_cos.c
@@ -0,0 +1,21 @@
+#include "ref.h"
+
+/*
+ * Reference float32 sine and cosine of an angle given in degrees.
+ * theta is converted to radians (theta * 2*pi / 360) and the results of
+ * sinf() and cosf() are stored in *pSinVal and *pCosVal.
+ */
+void ref_sin_cos_f32(
+  float32_t theta,
+  float32_t * pSinVal,
+  float32_t * pCosVal)
+{
+	const float32_t radians = theta * 6.28318530717959f / 360.0f;
+
+	*pSinVal = sinf(radians);
+	*pCosVal = cosf(radians);
+}
+
+/*
+ * Reference q31 sine and cosine.
+ *
+ * theta is given in the range [-1,1) to represent [-pi,pi). The angle is
+ * converted to radians, sinf()/cosf() are evaluated, and the results are
+ * scaled by 2^31.
+ *
+ * A result of exactly 1.0 (for example cos at theta = 0) scales to 2^31,
+ * which does not fit in q31_t; converting that float straight to q31_t is
+ * undefined behaviour. The scaled value is therefore converted to 64 bits
+ * first and narrowed with ref_sat_q31().
+ *
+ * theta   angle in q31
+ * pSinVal receives the sine in q31
+ * pCosVal receives the cosine in q31
+ */
+void ref_sin_cos_q31(
+  q31_t theta,
+  q31_t * pSinVal,
+  q31_t * pCosVal)
+{
+	const float32_t radians = (float32_t)theta * 3.14159265358979f / 2147483648.0f;
+
+	*pSinVal = ref_sat_q31((q63_t)(sinf(radians) * 2147483648.0f));
+	*pCosVal = ref_sat_q31((q63_t)(cosf(radians) * 2147483648.0f));
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/FastMathFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/FastMathFunctions.c
new file mode 100644
index 0000000..2ea0f98
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/FastMathFunctions.c
@@ -0,0 +1,5 @@
+
+#include "cos.c"
+#include "sin.c"
+#include "sqrt.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/cos.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/cos.c
new file mode 100644
index 0000000..ab6c98e
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/cos.c
@@ -0,0 +1,11 @@
+#include "ref.h"
+
+/*
+ * Reference q31 cosine.
+ *
+ * x is scaled so that x / 2^31 is a fraction of a full turn
+ * (angle = x * 2*pi / 2^31). The cosf() result is scaled by 2^31.
+ *
+ * cos(0) = 1.0 scales to 2^31, which does not fit in q31_t; converting that
+ * float straight to q31_t is undefined behaviour. The scaled value is
+ * therefore converted to 64 bits first and narrowed with ref_sat_q31().
+ */
+q31_t ref_cos_q31(q31_t x)
+{
+	return ref_sat_q31((q63_t)(cosf((float32_t)x * 6.28318530717959f / 2147483648.0f) * 2147483648.0f));
+}
+
+/*
+ * Reference q15 cosine.
+ *
+ * x is scaled so that x / 2^15 is a fraction of a full turn
+ * (angle = x * 2*pi / 2^15). The cosf() result is scaled by 2^15.
+ *
+ * cos(0) = 1.0 scales to 32768, which does not fit in q15_t; converting that
+ * float straight to q15_t is undefined behaviour. The scaled value is
+ * therefore converted to 32 bits first and narrowed with ref_sat_q15().
+ */
+q15_t ref_cos_q15(q15_t x)
+{
+	return ref_sat_q15((q31_t)(cosf((float32_t)x * 6.28318530717959f / 32768.0f) * 32768.0f));
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/sin.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/sin.c
new file mode 100644
index 0000000..3f303a5
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/sin.c
@@ -0,0 +1,11 @@
+#include "ref.h"
+
+/*
+ * Reference q31 sine.
+ *
+ * x is scaled so that x / 2^31 is a fraction of a full turn
+ * (angle = x * 2*pi / 2^31). The sinf() result is scaled by 2^31.
+ *
+ * A result of exactly 1.0 scales to 2^31, which does not fit in q31_t;
+ * converting that float straight to q31_t is undefined behaviour. The scaled
+ * value is therefore converted to 64 bits first and narrowed with
+ * ref_sat_q31().
+ */
+q31_t ref_sin_q31(q31_t x)
+{
+	return ref_sat_q31((q63_t)(sinf((float32_t)x * 6.28318530717959f / 2147483648.0f) * 2147483648.0f));
+}
+
+/*
+ * Reference q15 sine.
+ *
+ * x is scaled so that x / 2^15 is a fraction of a full turn
+ * (angle = x * 2*pi / 2^15). The sinf() result is scaled by 2^15.
+ *
+ * A result of exactly 1.0 scales to 32768, which does not fit in q15_t;
+ * converting that float straight to q15_t is undefined behaviour. The scaled
+ * value is therefore converted to 32 bits first and narrowed with
+ * ref_sat_q15().
+ */
+q15_t ref_sin_q15(q15_t x)
+{
+	return ref_sat_q15((q31_t)(sinf((float32_t)x * 6.28318530717959f / 32768.0f) * 32768.0f));
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/sqrt.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/sqrt.c
new file mode 100644
index 0000000..9dc34af
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FastMathFunctions/sqrt.c
@@ -0,0 +1,15 @@
+#include "ref.h"
+
+/*
+ * Reference q31 square root.
+ *
+ * The input is scaled by 1 / 2^31, sqrtf() is applied, and the result is
+ * scaled back by 2^31.
+ *
+ * Two cases previously reached an out-of-range float-to-integer conversion,
+ * which is undefined behaviour:
+ *  - a negative input makes sqrtf() return NaN; it is now rejected with
+ *    *pOut = 0 and ARM_MATH_ARGUMENT_ERROR, as the CMSIS documentation for
+ *    arm_sqrt_q31 describes;
+ *  - inputs close to the q31 maximum round to 2^31 when converted to float,
+ *    so the scaled root is 2^31; the value is now converted to 64 bits and
+ *    narrowed with ref_sat_q31().
+ *
+ * in    value whose square root is taken
+ * pOut  receives the square root in q31 (0 on error)
+ * Returns ARM_MATH_SUCCESS for in >= 0, ARM_MATH_ARGUMENT_ERROR otherwise.
+ */
+arm_status ref_sqrt_q31(q31_t in, q31_t * pOut)
+{
+	if (in < 0)
+	{
+		*pOut = 0;
+		return ARM_MATH_ARGUMENT_ERROR;
+	}
+
+	*pOut = ref_sat_q31((q63_t)(sqrtf((float32_t)in / 2147483648.0f) * 2147483648.0f));
+
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Reference q15 square root.
+ *
+ * The input is scaled by 1 / 2^15, sqrtf() is applied, and the result is
+ * scaled back by 2^15 and truncated to q15_t.
+ *
+ * A negative input makes sqrtf() return NaN, and converting NaN to an
+ * integer is undefined behaviour. Such input is now rejected with
+ * *pOut = 0 and ARM_MATH_ARGUMENT_ERROR, as the CMSIS documentation for
+ * arm_sqrt_q15 describes.
+ *
+ * in    value whose square root is taken
+ * pOut  receives the square root in q15 (0 on error)
+ * Returns ARM_MATH_SUCCESS for in >= 0, ARM_MATH_ARGUMENT_ERROR otherwise.
+ */
+arm_status ref_sqrt_q15(q15_t in, q15_t * pOut)
+{
+	if (in < 0)
+	{
+		*pOut = 0;
+		return ARM_MATH_ARGUMENT_ERROR;
+	}
+
+	*pOut = (q15_t)(sqrtf((float32_t)in / 32768.0f) * 32768.0f);
+
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/FilteringFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/FilteringFunctions.c
new file mode 100644
index 0000000..a1eeaaa
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/FilteringFunctions.c
@@ -0,0 +1,12 @@
+
+#include "biquad.c"
+#include "conv.c"
+#include "correlate.c"
+#include "fir.c"
+#include "fir_decimate.c"
+#include "fir_interpolate.c"
+#include "fir_lattice.c"
+#include "fir_sparse.c"
+#include "iir_lattice.c"
+#include "lms.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/biquad.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/biquad.c
new file mode 100644
index 0000000..78634cd
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/biquad.c
@@ -0,0 +1,713 @@
+#include "ref.h"
+/* Reference float32 biquad cascade in transposed direct form II; reads S->pCoeffs and updates S->pState. */
+void ref_biquad_cascade_df2T_f32(
+	const arm_biquad_cascade_df2T_instance_f32 * S,
+	float32_t * pSrc,
+	float32_t * pDst,
+	uint32_t blockSize)
+{
+   const float32_t *coef = S->pCoeffs;   /* b0, b1, b2, a1, a2 of each stage, back to back */
+   float32_t *state = S->pState;         /* d1, d2 of each stage, back to back             */
+   float32_t *in = pSrc;                 /* input buffer of the stage being run            */
+   uint32_t stagesLeft = S->numStages;   /* stages still to run                            */
+   uint32_t n;                           /* sample index                                   */
+   float32_t b0, b1, b2, a1, a2;         /* coefficients of the current stage              */
+   float32_t d1, d2;                     /* state of the current stage                     */
+   float32_t x;                          /* input sample                                   */
+   float32_t y;                          /* output sample                                  */
+
+   /*
+    * Recurrence evaluated for every sample of every stage:
+    *    y[n] = b0 * x[n] + d1
+    *    d1   = b1 * x[n] + a1 * y[n] + d2
+    *    d2   = b2 * x[n] + a2 * y[n]
+    */
+
+   do
+   {
+      /* Fetch the five coefficients of this stage and step to the next set */
+      b0 = coef[0];
+      b1 = coef[1];
+      b2 = coef[2];
+      a1 = coef[3];
+      a2 = coef[4];
+      coef += 5;
+
+      /* Fetch the saved state of this stage */
+      d1 = state[0];
+      d2 = state[1];
+
+      /* Run the whole block through this stage */
+      for (n = 0U; n < blockSize; n++)
+      {
+         /* The input is read before the output is stored, */
+         /* so the in-place stages (in == pDst) are safe   */
+         x = in[n];
+
+         /* y[n] = b0 * x[n] + d1 */
+         y = (b0 * x) + d1;
+         pDst[n] = y;
+
+         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+         d1 = (b1 * x + a1 * y) + d2;
+
+         /* d2 = b2 * x[n] + a2 * y[n] */
+         d2 = (b2 * x) + (a2 * y);
+      }
+
+      /* Write the state back for the next call */
+      /* and step to the next stage's slot      */
+      state[0] = d1;
+      state[1] = d2;
+      state += 2;
+
+      /* Every stage after the first filters */
+      /* the destination buffer in place     */
+      in = pDst;
+
+      /* The count is tested after the body, so one stage always runs */
+   } while (--stagesLeft > 0U);
+}
+
+/* Reference float32 stereo biquad cascade, transposed direct form II; each loop pass consumes one interleaved a/b sample pair. */
+void ref_biquad_cascade_stereo_df2T_f32(
+	const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+	      float32_t * pSrc,
+	      float32_t * pDst,
+	      uint32_t blockSize)
+{
+    const float32_t *coef = S->pCoeffs;   /* b0, b1, b2, a1, a2 of each stage, back to back  */
+    float32_t *state = S->pState;         /* d1a, d2a, d1b, d2b of each stage, back to back  */
+    float32_t *in = pSrc;                 /* read position, interleaved a/b samples          */
+    float32_t *out = pDst;                /* write position, interleaved a/b samples         */
+    uint32_t stagesLeft = S->numStages;   /* stages still to run                             */
+    uint32_t n;                           /* sample pairs still to run in this stage         */
+    float32_t b0, b1, b2, a1, a2;         /* coefficients, shared by both channels           */
+    float32_t d1a, d2a;                   /* channel a state                                 */
+    float32_t d1b, d2b;                   /* channel b state                                 */
+    float32_t xa, xb;                     /* input pair                                      */
+    float32_t ya, yb;                     /* output pair                                     */
+
+    /*
+     * Both channels run the same recurrence on their own state:
+     *    y[n] = b0 * x[n] + d1
+     *    d1   = b1 * x[n] + a1 * y[n] + d2
+     *    d2   = b2 * x[n] + a2 * y[n]
+     */
+    do
+    {
+        /* Fetch the five coefficients of this stage and step to the next set */
+        b0 = coef[0];
+        b1 = coef[1];
+        b2 = coef[2];
+        a1 = coef[3];
+        a2 = coef[4];
+        coef += 5;
+
+        /* Fetch the saved state of both channels */
+        d1a = state[0];
+        d2a = state[1];
+        d1b = state[2];
+        d2b = state[3];
+
+        /* blockSize counts sample pairs, so 2 * blockSize values are read and written */
+        for (n = blockSize; n > 0U; n--)
+        {
+            /* Read the whole pair before anything is stored (in may equal out) */
+            xa = in[0];
+            xb = in[1];
+            in += 2;
+
+            /* y[n] = b0 * x[n] + d1 */
+            ya = (b0 * xa) + d1a;
+            yb = (b0 * xb) + d1b;
+
+            /* Emit the output pair */
+            out[0] = ya;
+            out[1] = yb;
+            out += 2;
+
+            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+            d1a = ((b1 * xa) + (a1 * ya)) + d2a;
+            d1b = ((b1 * xb) + (a1 * yb)) + d2b;
+
+            /* d2 = b2 * x[n] + a2 * y[n] */
+            d2a = (b2 * xa) + (a2 * ya);
+            d2b = (b2 * xb) + (a2 * yb);
+        }
+
+        /* Write the state back and step to the next stage's slot */
+        state[0] = d1a;
+        state[1] = d2a;
+        state[2] = d1b;
+        state[3] = d2b;
+        state += 4;
+
+        /* Every stage after the first filters the destination buffer in place */
+        in = pDst;
+        out = pDst;
+    } while (--stagesLeft > 0U);
+}
+/* Reference float64 biquad cascade in transposed direct form II; reads S->pCoeffs and updates S->pState. */
+void ref_biquad_cascade_df2T_f64(
+	const arm_biquad_cascade_df2T_instance_f64 * S,
+	float64_t * pSrc,
+	float64_t * pDst,
+	uint32_t blockSize)
+{
+   float64_t *coef = S->pCoeffs;         /* b0, b1, b2, a1, a2 of each stage, back to back */
+   float64_t *state = S->pState;         /* d1, d2 of each stage, back to back             */
+   float64_t *in = pSrc;                 /* input buffer of the stage being run            */
+   uint32_t stagesLeft = S->numStages;   /* stages still to run                            */
+   uint32_t n;                           /* sample index                                   */
+   float64_t b0, b1, b2, a1, a2;         /* coefficients of the current stage              */
+   float64_t d1, d2;                     /* state of the current stage                     */
+   float64_t x;                          /* input sample                                   */
+   float64_t y;                          /* output sample                                  */
+
+   /*
+    * Recurrence evaluated for every sample of every stage:
+    *    y[n] = b0 * x[n] + d1
+    *    d1   = b1 * x[n] + a1 * y[n] + d2
+    *    d2   = b2 * x[n] + a2 * y[n]
+    */
+
+   do
+   {
+      /* Fetch the five coefficients of this stage and step to the next set */
+      b0 = coef[0];
+      b1 = coef[1];
+      b2 = coef[2];
+      a1 = coef[3];
+      a2 = coef[4];
+      coef += 5;
+
+      /* Fetch the saved state of this stage */
+      d1 = state[0];
+      d2 = state[1];
+
+      /* Run the whole block through this stage */
+      for (n = 0U; n < blockSize; n++)
+      {
+         /* The input is read before the output is stored, */
+         /* so the in-place stages (in == pDst) are safe   */
+         x = in[n];
+
+         /* y[n] = b0 * x[n] + d1 */
+         y = (b0 * x) + d1;
+         pDst[n] = y;
+
+         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+         d1 = (b1 * x + a1 * y) + d2;
+
+         /* d2 = b2 * x[n] + a2 * y[n] */
+         d2 = (b2 * x) + (a2 * y);
+      }
+
+      /* Write the state back for the next call */
+      /* and step to the next stage's slot      */
+      state[0] = d1;
+      state[1] = d2;
+      state += 2;
+
+      /* Every stage after the first filters */
+      /* the destination buffer in place     */
+      in = pDst;
+
+      /* The count is tested after the body, so one stage always runs */
+   } while (--stagesLeft > 0U);
+}
+/* Reference float32 biquad cascade in direct form I; reads S->pCoeffs and updates the four history words per stage in S->pState. */
+void ref_biquad_cascade_df1_f32(
+  const arm_biquad_casd_df1_inst_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  const float32_t *coef = S->pCoeffs;    /* b0, b1, b2, a1, a2 of each stage, back to back */
+  float32_t *state = S->pState;          /* x[n-1], x[n-2], y[n-1], y[n-2] of each stage   */
+  float32_t *in = pSrc;                  /* input buffer of the stage being run            */
+  uint32_t stagesLeft = S->numStages;    /* stages still to run                            */
+  uint32_t n;                            /* sample index                                   */
+  float32_t b0, b1, b2, a1, a2;          /* coefficients of the current stage              */
+  float32_t xm1, xm2;                    /* x[n-1], x[n-2]                                 */
+  float32_t ym1, ym2;                    /* y[n-1], y[n-2]                                 */
+  float32_t x;                           /* x[n]                                           */
+  float32_t y;                           /* y[n]                                           */
+
+  /*
+   * Difference equation evaluated for every sample of every stage:
+   *    y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+   * The feedback terms are added, not subtracted; the four history
+   * values of each stage live in consecutive state words.
+   */
+
+  do
+  {
+    /* Fetch the five coefficients of this stage */
+    /* and step to the next set                  */
+    b0 = coef[0];
+    b1 = coef[1];
+    b2 = coef[2];
+    a1 = coef[3];
+    a2 = coef[4];
+    coef += 5;
+
+    /* Fetch the saved input/output history of this stage */
+    xm1 = state[0];
+    xm2 = state[1];
+    ym1 = state[2];
+    ym2 = state[3];
+
+    /* Run the whole block through this stage */
+    for (n = 0U; n < blockSize; n++)
+    {
+      /* The input is read before the output is stored, */
+      /* so the in-place stages (in == pDst) are safe   */
+      x = in[n];
+
+      /* y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+      y = (b0 * x) + (b1 * xm1) + (b2 * xm2) + (a1 * ym1) + (a2 * ym2);
+
+      /* Emit the output sample */
+      pDst[n] = y;
+
+      /* Age the history by one sample: */
+      /*   x[n-2] <- x[n-1]             */
+      /*   x[n-1] <- x[n]               */
+      /*   y[n-2] <- y[n-1]             */
+      /*   y[n-1] <- y[n]               */
+      xm2 = xm1;
+      xm1 = x;
+      ym2 = ym1;
+      ym1 = y;
+    }
+
+    /* Write the history back for the next call */
+    /* and step to the next stage's slot        */
+    state[0] = xm1;
+    state[1] = xm2;
+    state[2] = ym1;
+    state[3] = ym2;
+    state += 4;
+
+    /* Every stage after the first filters */
+    /* the destination buffer in place     */
+    in = pDst;
+
+    /* The count is tested after the body, */
+    /* so one stage always runs            */
+  } while (--stagesLeft > 0U);
+}
+/* Reference Q31 biquad cascade, direct form I, with 64-bit output history (four q63_t state words per stage); updates S->pState. */
+void ref_biquad_cas_df1_32x64_q31(
+  const arm_biquad_cas_df1_32x64_ins_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        q31_t *pIn = pSrc;                             /*  input of the current stage (pSrc, then pDst)   */
+        q31_t *pOut = pDst;                            /*  output write position                          */
+        q63_t *pState = S->pState;                     /*  state: Xn1, Xn2, Yn1, Yn2 for each stage       */
+  const q31_t *pCoeffs = S->pCoeffs;                   /*  coefficients: b0, b1, b2, a1, a2 per stage     */
+        q63_t acc;                                     /*  64-bit accumulator                             */
+        q31_t Xn1, Xn2;                                /*  x[n-1], x[n-2]                                 */
+        q63_t Yn1, Yn2;                                /*  y[n-1], y[n-2], kept at 64-bit width           */
+        q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients                            */
+        q31_t Xn;                                      /*  x[n]                                           */
+        int32_t shift = (int32_t) S->postShift + 1;    /*  left shift applied to acc to form Yn1          */
+        uint32_t sample, stage = S->numStages;         /*  loop counters                                  */
+        q31_t acc_l, acc_h;                            /*  low / high 32-bit halves of acc                */
+        uint32_t uShift = ((uint32_t) S->postShift + 1U); /*  same amount as shift, unsigned; moves acc_h up */
+        uint32_t lShift = 32U - uShift;                /*  complementary shift that brings acc_l bits down */
+
+  do
+  {
+    /* Read the five coefficients of this stage */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Read the state; state words 0 and 1 are q63_t and are narrowed to q31_t on assignment */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+
+    sample = blockSize;
+
+    while (sample > 0U)
+    {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* Feed-forward part: acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2], each a 32x32 -> 64-bit product */
+      acc = (q63_t)Xn*b0 + (q63_t)Xn1*b1 + (q63_t)Xn2*b2;
+      /* acc += a1 * y[n-1]; mult32x64 is defined outside this file - presumably a 64-bit by 32-bit multiply, TODO confirm its scaling */
+      acc += mult32x64(Yn1, a1);
+      /* acc += a2 * y[n-2], same helper */
+      acc += mult32x64(Yn2, a2);
+
+      /* Age the history; Yn1 itself is refreshed from acc just below */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+
+      /* New y[n-1]: acc shifted left by postShift + 1, kept at full 64-bit width */
+      Yn1 = acc << shift;
+
+      /* Low 32 bits of acc */
+      acc_l = acc & 0xffffffff;
+
+      /* High 32 bits of acc */
+      acc_h = (acc >> 32) & 0xffffffff;
+
+      /* Bits 63..32 of (acc << uShift): the high half moved up, with the top uShift bits of the low half filled in below */
+      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+      /* Store that 32-bit word as the output sample */
+      *pOut++ = acc_h;
+
+      /* decrement the loop counter */
+      sample--;
+    }
+
+    /* Every stage after the first filters the destination buffer in place */
+    pIn = pDst;
+
+    /* Rewind the write position to the start of the destination buffer */
+    pOut = pDst;
+
+    /* Write the history back (inputs widened to q63_t) and step to the next stage's four state words */
+    *pState++ = (q63_t) Xn1;
+    *pState++ = (q63_t) Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);
+}
+/* Reference Q31 biquad cascade in direct form I with a 64-bit accumulator; reads S->pCoeffs / S->postShift and updates S->pState. */
+void ref_biquad_cascade_df1_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{	
+  const q31_t *coef = S->pCoeffs;                              /* b0, b1, b2, a1, a2 of each stage, back to back */
+  q31_t *state = S->pState;                                    /* x[n-1], x[n-2], y[n-1], y[n-2] of each stage   */
+  q31_t *in = pSrc;                                            /* input buffer of the stage being run            */
+  uint32_t stagesLeft = S->numStages;                          /* stages still to run                            */
+  uint32_t n;                                                  /* sample index                                   */
+  uint32_t downShift = 32U - ((uint32_t) S->postShift + 1U);   /* right shift from accumulator to output         */
+  q31_t b0, b1, b2, a1, a2;                                    /* coefficients of the current stage              */
+  q31_t xm1, xm2;                                              /* x[n-1], x[n-2]                                 */
+  q31_t ym1, ym2;                                              /* y[n-1], y[n-2]                                 */
+  q31_t x;                                                     /* x[n]                                           */
+  q31_t y;                                                     /* y[n]                                           */
+  q63_t acc;                                                   /* 64-bit accumulator                             */
+
+  /*
+   * Difference equation evaluated for every sample of every stage,
+   * using a 64-bit accumulator:
+   *    acc  = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+   *    y[n] = (q31_t) (acc >> (32 - (postShift + 1)))
+   * The feedback terms are added, not subtracted, and the narrowing
+   * to q31_t is a plain cast: nothing in this function saturates.
+   */
+
+  do
+  {
+    /* Fetch the five coefficients of this stage */
+    /* and step to the next set                  */
+    b0 = coef[0];
+    b1 = coef[1];
+    b2 = coef[2];
+    a1 = coef[3];
+    a2 = coef[4];
+    coef += 5;
+
+    /* Fetch the saved input/output history of this stage */
+    xm1 = state[0];
+    xm2 = state[1];
+    ym1 = state[2];
+    ym2 = state[3];
+
+    /* Run the whole block through this stage */
+    for (n = 0U; n < blockSize; n++)
+    {
+      /* The input is read before the output is stored, */
+      /* so the in-place stages (in == pDst) are safe   */
+      x = in[n];
+
+      /* Accumulate the five 32x32 -> 64-bit products, */
+      /* feed-forward terms first, then feedback       */
+      acc = (q63_t) b0 * x;
+      acc += (q63_t) b1 * xm1;
+      acc += (q63_t) b2 * xm2;
+      acc += (q63_t) a1 * ym1;
+      acc += (q63_t) a2 * ym2;
+
+      /* Scale the accumulator down and narrow it to 32 bits */
+      y = (q31_t) (acc >> downShift);
+
+      /* Age the history by one sample: */
+      /*   x[n-2] <- x[n-1]             */
+      /*   x[n-1] <- x[n]               */
+      /*   y[n-2] <- y[n-1]             */
+      /*   y[n-1] <- y[n]               */
+      xm2 = xm1;
+      xm1 = x;
+      ym2 = ym1;
+      ym1 = y;
+
+      /* Emit the output sample */
+      pDst[n] = y;
+    }
+
+    /* Every stage after the first filters */
+    /* the destination buffer in place     */
+    in = pDst;
+
+    /* Write the history back for the next call */
+    /* and step to the next stage's slot        */
+    state[0] = xm1;
+    state[1] = xm2;
+    state[2] = ym1;
+    state[3] = ym2;
+    state += 4;
+
+    /* The count is tested after the body, */
+    /* so one stage always runs            */
+  } while (--stagesLeft != 0U);
+}
+
+/* Reference for the "fast" Q31 direct form I biquad cascade: the accumulator is only 32 bits wide; updates S->pState. */
+void ref_biquad_cascade_df1_fast_q31(
+  const arm_biquad_casd_df1_inst_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+        q31_t acc = 0;                                 /*  32-bit accumulator                */
+        q31_t Xn1, Xn2, Yn1, Yn2;                      /*  x[n-1], x[n-2], y[n-1], y[n-2]    */
+        q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients               */
+        q31_t *pIn = pSrc;                             /*  input of the current stage        */
+        q31_t *pOut = pDst;                            /*  output write position             */
+        q31_t *pState = S->pState;                     /*  four state words per stage        */
+  const q31_t *pCoeffs = S->pCoeffs;                   /*  five coefficients per stage       */
+        q31_t Xn;                                      /*  x[n]                              */
+        int32_t shift = (int32_t) S->postShift + 1;    /*  left shift applied to acc         */
+        uint32_t sample, stage = S->numStages;         /*  loop counters                     */
+
+  do
+  {
+    /* Read the five coefficients of this stage */
+    b0 = *pCoeffs++;
+    b1 = *pCoeffs++;
+    b2 = *pCoeffs++;
+    a1 = *pCoeffs++;
+    a2 = *pCoeffs++;
+
+    /* Read the saved history of this stage */
+    Xn1 = pState[0];
+    Xn2 = pState[1];
+    Yn1 = pState[2];
+    Yn2 = pState[3];
+		
+    sample = blockSize;
+
+   while (sample > 0U)
+   {
+      /* Read the input */
+      Xn = *pIn++;
+
+      /* acc = b0*x[n] + b1*x[n-1] + b2*x[n-2] + a1*y[n-1] + a2*y[n-2]; these macros are defined outside this file and update acc in place - presumably keeping the rounded upper 32 bits of each 64-bit product, TODO confirm */
+      mult_32x32_keep32_R(acc, b0, Xn);
+      multAcc_32x32_keep32_R(acc, b1, Xn1);
+      multAcc_32x32_keep32_R(acc, b2, Xn2);
+      multAcc_32x32_keep32_R(acc, a1, Yn1);
+      multAcc_32x32_keep32_R(acc, a2, Yn2);
+
+      /* Scale up by postShift + 1 bits; a plain shift of the 32-bit value, with no saturation in this function */
+      acc <<= shift;
+
+      /* Age the history by one sample; the new y[n-1] is the shifted accumulator */
+      Xn2 = Xn1;
+      Xn1 = Xn;
+      Yn2 = Yn1;
+      Yn1 = acc;
+
+      /* Store the output in the destination buffer. */
+      *pOut++ = acc;
+
+      /* decrement the loop counter */
+      sample--;
+   }
+
+    /*  The first stage goes from the input buffer to the output buffer. */
+    /*  Subsequent stages occur in-place in the output buffer */
+    pIn = pDst;
+
+    /* Rewind the write position to the start of the destination buffer */
+    pOut = pDst;
+
+    /* Write the history back and step to the next stage's four state words */
+    *pState++ = Xn1;
+    *pState++ = Xn2;
+    *pState++ = Yn1;
+    *pState++ = Yn2;
+
+  } while (--stage);
+}
+/* Reference for the "fast" Q15 direct form I biquad cascade: products are summed in a 32-bit accumulator; updates S->pState. */
+void ref_biquad_cascade_df1_fast_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  const q15_t *coef = S->pCoeffs;                       /* six words per stage: b0, unused, b1, b2, a1, a2 */
+  q15_t *state = S->pState;                             /* x[n-1], x[n-2], y[n-1], y[n-2] of each stage    */
+  q15_t *in = pSrc;                                     /* input buffer of the stage being run             */
+  uint32_t stagesLeft = (uint32_t) S->numStages;        /* stages still to run                             */
+  uint32_t n;                                           /* sample index                                    */
+  int32_t downShift = (15 - (int32_t) S->postShift);    /* right shift from accumulator to output          */
+  q15_t b0, b1, b2, a1, a2;                             /* coefficients of the current stage               */
+  q15_t xm1, xm2;                                       /* x[n-1], x[n-2]                                  */
+  q15_t ym1, ym2;                                       /* y[n-1], y[n-2]                                  */
+  q15_t x;                                              /* x[n]                                            */
+  q15_t y;                                              /* y[n]                                            */
+  q31_t acc;                                            /* 32-bit accumulator                              */
+
+  /*
+   * Per sample and per stage:
+   *    acc  = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+   *    y[n] = ref_sat_q15(acc >> (15 - postShift))
+   */
+
+  do
+  {
+    /* Fetch this stage's coefficients; the word after b0 is skipped, */
+    /* so a stage occupies six words in the coefficient array         */
+    b0 = coef[0];
+    b1 = coef[2];
+    b2 = coef[3];
+    a1 = coef[4];
+    a2 = coef[5];
+    coef += 6;
+
+    /* Fetch the saved input/output history of this stage */
+    xm1 = state[0];
+    xm2 = state[1];
+    ym1 = state[2];
+    ym2 = state[3];
+
+    /* Run the whole block through this stage */
+    for (n = 0U; n < blockSize; n++)
+    {
+      /* The input is read before the output is stored (in may equal pDst) */
+      x = in[n];
+
+      /* Sum of the five 16x16 products, evaluated in 32-bit arithmetic */
+      acc = (q31_t)b0*x + (q31_t)b1*xm1 + (q31_t)b2*xm2 + (q31_t)a1*ym1 + (q31_t)a2*ym2;
+
+      /* Scale down, pass through ref_sat_q15(), then narrow to 16 bits */
+      acc = ref_sat_q15(acc >> downShift);
+      y = (q15_t) acc;
+
+      /* Age the history by one sample and emit the output */
+      xm2 = xm1;
+      xm1 = x;
+      ym2 = ym1;
+      ym1 = y;
+      pDst[n] = y;
+    }
+
+    /* Every stage after the first filters the destination buffer in place */
+    in = pDst;
+
+    /* Write the history back and step to the next stage's slot */
+    state[0] = xm1;
+    state[1] = xm2;
+    state[2] = ym1;
+    state[3] = ym2;
+    state += 4;
+  } while (--stagesLeft != 0U);
+}
+/* Reference Q15 direct form I biquad cascade with a 64-bit accumulator; reads S->pCoeffs / S->postShift and updates S->pState. */
+void ref_biquad_cascade_df1_q15(
+  const arm_biquad_casd_df1_inst_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  const q15_t *coef = S->pCoeffs;                       /* six words per stage: b0, unused, b1, b2, a1, a2 */
+  q15_t *state = S->pState;                             /* x[n-1], x[n-2], y[n-1], y[n-2] of each stage    */
+  q15_t *in = pSrc;                                     /* input buffer of the stage being run             */
+  uint32_t stagesLeft = (uint32_t) S->numStages;        /* stages still to run                             */
+  uint32_t n;                                           /* sample index                                    */
+  int32_t downShift = (15 - (int32_t) S->postShift);    /* right shift from accumulator to output          */
+  q15_t b0, b1, b2, a1, a2;                             /* coefficients of the current stage               */
+  q15_t xm1, xm2;                                       /* x[n-1], x[n-2]                                  */
+  q15_t ym1, ym2;                                       /* y[n-1], y[n-2]                                  */
+  q15_t x;                                              /* x[n]                                            */
+  q15_t y;                                              /* y[n]                                            */
+  q63_t acc;                                            /* 64-bit accumulator                              */
+
+  /*
+   * Per sample and per stage:
+   *    acc  = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+   *    y[n] = ref_sat_q15(acc >> (15 - postShift))
+   */
+
+  do
+  {
+    /* Fetch this stage's coefficients; the word after b0 is skipped, */
+    /* so a stage occupies six words in the coefficient array         */
+    b0 = coef[0];
+    b1 = coef[2];
+    b2 = coef[3];
+    a1 = coef[4];
+    a2 = coef[5];
+    coef += 6;
+
+    /* Fetch the saved input/output history of this stage */
+    xm1 = state[0];
+    xm2 = state[1];
+    ym1 = state[2];
+    ym2 = state[3];
+
+    /* Run the whole block through this stage */
+    for (n = 0U; n < blockSize; n++)
+    {
+      /* The input is read before the output is stored (in may equal pDst) */
+      x = in[n];
+
+      /* Widen every product to 64 bits: five products of up to 2^30 each do not fit a 32-bit sum */
+      acc = (q63_t)b0*x + (q63_t)b1*xm1 + (q63_t)b2*xm2 + (q63_t)a1*ym1 + (q63_t)a2*ym2;
+
+      /* Scale down, pass through ref_sat_q15(), then narrow to 16 bits */
+      acc = ref_sat_q15(acc >> downShift);
+      y = (q15_t) acc;
+
+      /* Age the history by one sample and emit the output */
+      xm2 = xm1;
+      xm1 = x;
+      ym2 = ym1;
+      ym1 = y;
+      pDst[n] = y;
+    }
+
+    /* Every stage after the first filters the destination buffer in place */
+    in = pDst;
+
+    /* Write the history back and step to the next stage's slot */
+    state[0] = xm1;
+    state[1] = xm2;
+    state[2] = ym1;
+    state[3] = ym2;
+    state += 4;
+  } while (--stagesLeft != 0U);
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/conv.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/conv.c
new file mode 100644
index 0000000..dc1b103
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/conv.c
@@ -0,0 +1,350 @@
+#include "ref.h"
+/* Reference float32 convolution: writes srcALen + srcBLen - 1 samples to pDst. */
+void ref_conv_f32(
+  float32_t * pSrcA,
+  uint32_t 		srcALen,
+  float32_t * pSrcB,
+  uint32_t 		srcBLen,
+  float32_t * pDst)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples          */
+  uint32_t n, k;                                    /* output index, index into pSrcA    */
+  float32_t acc;                                    /* running sum for one output sample */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0.0f;
+
+    /* pDst[n] = sum of pSrcB[n - k] * pSrcA[k], visiting k = 0 .. n in ascending order */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+
+      acc += pSrcB[n - k] * pSrcA[k];
+    }
+
+    pDst[n] = acc;
+  }
+}
+/* Partial-convolution reference (float32): runs the full ref_conv_f32(); firstIndex and numPoints are not used. */
+arm_status ref_conv_partial_f32(
+  float32_t * pSrcA,
+  uint32_t srcALen,
+  float32_t * pSrcB,
+  uint32_t srcBLen,
+  float32_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_f32(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	/* pDst now holds all srcALen + srcBLen - 1 outputs from index 0; success is reported unconditionally */
+	return ARM_MATH_SUCCESS;
+}
+/* Reference Q31 convolution with a 64-bit accumulator: writes srcALen + srcBLen - 1 samples to pDst. */
+void ref_conv_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples          */
+  uint32_t n, k;                                    /* output index, index into pSrcA    */
+  q63_t acc;                                        /* running sum for one output sample */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0;
+
+    /* pDst[n] accumulates pSrcA[k] * pSrcB[n - k] for k = 0 .. n */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+
+      acc += (q63_t) pSrcA[k] * (pSrcB[n - k]);
+    }
+
+    /* Drop 31 fractional bits; the cast truncates the 64-bit sum, it does not saturate */
+    pDst[n] = (q31_t)(acc >> 31U);
+  }
+}
+/* Reference for the "fast" Q31 convolution: only the upper 32 bits of the running sum are carried between terms. */
+void ref_conv_fast_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples                  */
+  uint32_t n, k;                                    /* output index, index into pSrcA            */
+  q31_t acc;                                        /* upper 32 bits of the running sum          */
+  q63_t wide;                                       /* 64-bit sum before its low half is dropped */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0;
+
+    /* pDst[n] accumulates pSrcA[k] * pSrcB[n - k] for k = 0 .. n */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+      /* Put acc back in the upper half, add the 64-bit product, keep the upper half again */
+      wide = ((q63_t)acc << 32) + ((q63_t)pSrcA[k] * pSrcB[n - k]);
+      acc = (q31_t) (wide >> 32);
+    }
+    /* One left shift on the 32-bit sum; the vacated low bit is zero */
+    pDst[n] = (q31_t)(acc << 1U);
+  }
+}
+/* Partial-convolution reference (Q31): runs the full ref_conv_q31(); firstIndex and numPoints are not used. */
+arm_status ref_conv_partial_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_q31(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	/* pDst now holds all srcALen + srcBLen - 1 outputs from index 0; success is reported unconditionally */
+	return ARM_MATH_SUCCESS;
+}
+/* Partial-convolution reference (fast Q31): runs the full ref_conv_fast_q31(); firstIndex and numPoints are not used. */
+arm_status ref_conv_partial_fast_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_fast_q31(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	/* pDst now holds all srcALen + srcBLen - 1 outputs from index 0; success is reported unconditionally */
+	return ARM_MATH_SUCCESS;
+}
+/* Reference Q15 convolution with a 64-bit accumulator: writes srcALen + srcBLen - 1 samples to pDst. */
+void ref_conv_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples          */
+  uint32_t n, k;                                    /* output index, index into pSrcA    */
+  q63_t acc;                                        /* running sum for one output sample */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0;
+
+    /* pDst[n] accumulates pSrcA[k] * pSrcB[n - k] for k = 0 .. n */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+
+      acc += (q31_t)pSrcA[k] * pSrcB[n - k];
+    }
+
+    /* Drop 15 fractional bits, then hand the value to ref_sat_q15() */
+    pDst[n] = ref_sat_q15(acc >> 15U);
+  }
+}
+/* Partial-convolution reference (fast/opt Q15): computes the full convolution; firstIndex, numPoints and both scratch buffers are not used. */
+arm_status ref_conv_partial_fast_opt_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints,
+  q15_t * pScratch1,
+  q15_t * pScratch2)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples          */
+  uint32_t n, k;                                    /* output index, index into pSrcA    */
+  q31_t acc;                                        /* running sum for one output sample */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0;
+
+    /* pDst[n] accumulates pSrcA[k] * pSrcB[n - k] for k = 0 .. n */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+
+      acc += (q31_t)pSrcA[k] * pSrcB[n - k];
+    }
+
+    /* Drop 15 fractional bits, then hand the value to ref_sat_q15() */
+    pDst[n] = ref_sat_q15(acc >> 15U);
+  }
+
+  return ARM_MATH_SUCCESS;
+}
+/* Reference for the "fast" Q15 convolution: 32-bit accumulator, and the result is stored without a saturation call. */
+void ref_conv_fast_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples          */
+  uint32_t n, k;                                    /* output index, index into pSrcA    */
+  q31_t acc;                                        /* running sum for one output sample */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0;
+
+    /* pDst[n] accumulates pSrcA[k] * pSrcB[n - k] for k = 0 .. n */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+
+      acc += (q31_t)pSrcA[k] * pSrcB[n - k];
+    }
+
+    /* Drop 15 fractional bits; the store narrows to q15_t by plain conversion */
+    pDst[n] = acc >> 15U;
+  }
+}
+/* Reference for the "fast opt" Q15 convolution: 32-bit accumulator; pScratch1 and pScratch2 are accepted but not used. */
+void ref_conv_fast_opt_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  q15_t * pScratch1,
+  q15_t * pScratch2)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples          */
+  uint32_t n, k;                                    /* output index, index into pSrcA    */
+  q31_t acc;                                        /* running sum for one output sample */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0;
+
+    /* pDst[n] accumulates pSrcA[k] * pSrcB[n - k] for k = 0 .. n */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+
+      acc += (q31_t)pSrcA[k] * pSrcB[n - k];
+    }
+
+    /* Drop 15 fractional bits, then hand the value to ref_sat_q15() */
+    pDst[n] = ref_sat_q15(acc >> 15U);
+  }
+}
+/* Partial-convolution reference (Q15): runs the full ref_conv_q15(); firstIndex and numPoints are not used. */
+arm_status ref_conv_partial_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_q15(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	/* pDst now holds all srcALen + srcBLen - 1 outputs from index 0; success is reported unconditionally */
+	return ARM_MATH_SUCCESS;
+}
+/* Partial-convolution reference (fast Q15): runs the full ref_conv_fast_q15(); firstIndex and numPoints are not used. */
+arm_status ref_conv_partial_fast_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{
+	ref_conv_fast_q15(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	/* pDst now holds all srcALen + srcBLen - 1 outputs from index 0; success is reported unconditionally */
+	return ARM_MATH_SUCCESS;
+}
+
+/* Reference Q7 convolution with a 32-bit accumulator: writes srcALen + srcBLen - 1 samples to pDst. */
+void ref_conv_q7(
+  q7_t * pSrcA,
+  uint32_t srcALen,
+  q7_t * pSrcB,
+  uint32_t srcBLen,
+  q7_t * pDst)
+{
+  const uint32_t outLen = srcALen + srcBLen - 1;   /* number of output samples          */
+  uint32_t n, k;                                    /* output index, index into pSrcA    */
+  q31_t acc;                                        /* running sum for one output sample */
+
+  for (n = 0; n < outLen; n++)
+  {
+    acc = 0;
+
+    /* pDst[n] accumulates pSrcA[k] * pSrcB[n - k] for k = 0 .. n */
+    for (k = 0; k <= n; k++)
+    {
+      /* Skip pairs whose index lies outside either input */
+      if ((n - k >= srcBLen) || (k >= srcALen))
+      {
+        continue;
+      }
+
+      acc += (q15_t)pSrcA[k] * pSrcB[n - k];
+    }
+
+    /* Drop 7 fractional bits, then hand the value to ref_sat_q7() */
+    pDst[n] = (q7_t)ref_sat_q7(acc >> 7);
+  }
+}
+/* Partial-convolution reference (Q7): runs the full ref_conv_q7(); firstIndex and numPoints are not used. */
+arm_status ref_conv_partial_q7(
+  q7_t * pSrcA,
+  uint32_t srcALen,
+  q7_t * pSrcB,
+  uint32_t srcBLen,
+  q7_t * pDst,
+  uint32_t firstIndex,
+  uint32_t numPoints)
+{	
+	ref_conv_q7(pSrcA,srcALen,pSrcB,srcBLen,pDst);
+	/* pDst now holds all srcALen + srcBLen - 1 outputs from index 0; success is reported unconditionally */
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/correlate.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/correlate.c
new file mode 100644
index 0000000..ff1d95b
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/correlate.c
@@ -0,0 +1,513 @@
+#include "ref.h"
+
+/*
+ * Reference floating-point correlation of pSrcA with pSrcB.
+ *
+ * The shorter sequence is always slid across the longer one.  Because
+ * CORR(x, y) is the reverse of CORR(y, x), when srcBLen > srcALen the two
+ * inputs are swapped and the results are stored back-to-front (inv = 1).
+ *
+ * Unequal lengths would normally require zero padding the shorter input.
+ * Instead, the srcALen + srcBLen - 1 computed values are placed at an offset
+ * inside pDst:
+ *   - srcALen > srcBLen: ascending, starting at pDst[srcALen - srcBLen], so
+ *     the first (srcALen - srcBLen) entries are skipped;
+ *   - srcALen < srcBLen: descending, from pDst[srcALen + srcBLen - 2] down to
+ *     pDst[0], so the (srcBLen - srcALen) entries after that are skipped;
+ *   - equal lengths: ascending from pDst[0].
+ * The skipped entries are not written by this function.
+ *
+ * pSrcA, srcALen  first input sequence and its length
+ * pSrcB, srcBLen  second input sequence and its length
+ * pDst            output buffer
+ */
+void ref_correlate_f32(
+  float32_t * pSrcA,
+  uint32_t srcALen,
+  float32_t * pSrcB,
+  uint32_t srcBLen,
+  float32_t * pDst)
+{
+  float32_t *pIn1 = pSrcA;                       /* Sequence indexed forwards */
+  float32_t *pIn2 = pSrcB + (srcBLen - 1U);      /* Last element of the sequence indexed backwards */
+  float32_t sum;                                 /* Accumulator */
+  uint32_t i = 0U, j;                            /* Loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Index of the last computed output */
+
+  /* Outputs 0 .. tot are computed, i.e. srcALen + srcBLen - 1 values */
+  tot = srcALen + srcBLen - 2U;
+
+  if (srcALen > srcBLen)
+  {
+    /* Skip the leading entries that correspond to zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Swap roles: B is indexed forwards ... */
+    pIn1 = pSrcB;
+
+    /* ... and A backwards from its last element */
+    pIn2 = pSrcA + srcALen - 1U;
+
+    /* Results are stored from the last computed position downwards */
+    pDst += tot;
+
+    /* Swap the lengths so that srcALen describes pIn1 and srcBLen pIn2 */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* One pass per computed output */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0.0f;
+
+    for (j = 0U; j <= i; j++)
+    {
+      /* Only pairings with both indices inside their sequences contribute */
+      if ((i - j < srcBLen) && (j < srcALen))
+      {
+        /* The backwards offset is negated as a signed value.  Negating the
+         * unsigned difference instead would give a huge positive index that
+         * only lands on the right element where pointers are 32 bits wide. */
+        sum += pIn1[j] * pIn2[-(int32_t)(i - j)];
+      }
+    }
+
+    /* Store the output, walking pDst in the direction chosen above */
+    if (inv == 1)
+    {
+      *pDst-- = sum;
+    }
+    else
+    {
+      *pDst++ = sum;
+    }
+  }
+}
+
+/*
+ * Reference Q31 correlation of pSrcA with pSrcB.
+ *
+ * The shorter input is slid across the longer one.  When srcBLen > srcALen
+ * the inputs are swapped and the results are stored in reverse order, since
+ * CORR(x, y) is the reverse of CORR(y, x).
+ *
+ * srcALen + srcBLen - 1 values are written into pDst:
+ *   - srcALen >= srcBLen: ascending, starting at pDst[srcALen - srcBLen];
+ *   - srcALen <  srcBLen: descending, from pDst[srcALen + srcBLen - 2] to
+ *     pDst[0].
+ * Other elements of pDst are left untouched.
+ *
+ * Products are accumulated in a q63_t sum, which is shifted right by 31 and
+ * cast to q31_t; no saturation is applied.
+ *
+ * pSrcA, srcALen  first input sequence and its length
+ * pSrcB, srcBLen  second input sequence and its length
+ * pDst            output buffer
+ */
+void ref_correlate_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  q31_t *pIn1 = pSrcA;                           /* Sequence indexed forwards */
+  q31_t *pIn2 = pSrcB + (srcBLen - 1U);          /* Last element of the sequence indexed backwards */
+  q63_t sum;                                     /* Accumulator */
+  uint32_t i = 0U, j;                            /* Loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Index of the last computed output */
+
+  /* Outputs 0 .. tot are computed, i.e. srcALen + srcBLen - 1 values */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Skip the leading entries that correspond to zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Swap roles: B is indexed forwards ... */
+    pIn1 = pSrcB;
+
+    /* ... and A backwards from its last element */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Results are stored from the last computed position downwards */
+    pDst = pDst + tot;
+
+    /* Swap the lengths so that srcALen describes pIn1 and srcBLen pIn2 */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* One pass per computed output */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    for (j = 0U; j <= i; j++)
+    {
+      /* Only pairings with both indices inside their sequences contribute */
+      if (((i - j) < srcBLen) && (j < srcALen))
+      {
+        /* The backwards offset is negated as a signed value.  Negating the
+         * unsigned difference instead would give a huge positive index that
+         * only lands on the right element where pointers are 32 bits wide. */
+        sum += ((q63_t) pIn1[j] * pIn2[-(int32_t)(i - j)]);
+      }
+    }
+
+    /* Store the output, walking pDst in the direction chosen above */
+    if (inv == 1)
+    {
+      *pDst-- = (q31_t)(sum >> 31U);
+    }
+    else
+    {
+      *pDst++ = (q31_t)(sum >> 31U);
+    }
+  }
+}
+
+/*
+ * Reference model of the fast Q31 correlation of pSrcA with pSrcB.
+ *
+ * The shorter input is slid across the longer one.  When srcBLen > srcALen
+ * the inputs are swapped and the results are stored in reverse order, since
+ * CORR(x, y) is the reverse of CORR(y, x).
+ *
+ * srcALen + srcBLen - 1 values are written into pDst:
+ *   - srcALen >= srcBLen: ascending, starting at pDst[srcALen - srcBLen];
+ *   - srcALen <  srcBLen: descending, from pDst[srcALen + srcBLen - 2] to
+ *     pDst[0].
+ * Other elements of pDst are left untouched.
+ *
+ * After every multiply-accumulate only the upper 32 bits of
+ * (sum << 32) + product are kept, so the low half of each product is
+ * discarded.  The final sum is shifted left by one and cast to q31_t; no
+ * saturation is applied.
+ *
+ * pSrcA, srcALen  first input sequence and its length
+ * pSrcB, srcBLen  second input sequence and its length
+ * pDst            output buffer
+ */
+void ref_correlate_fast_q31(
+  q31_t * pSrcA,
+  uint32_t srcALen,
+  q31_t * pSrcB,
+  uint32_t srcBLen,
+  q31_t * pDst)
+{
+  q31_t *pIn1 = pSrcA;                           /* Sequence indexed forwards */
+  q31_t *pIn2 = pSrcB + (srcBLen - 1U);          /* Last element of the sequence indexed backwards */
+  q63_t sum;                                     /* Accumulator */
+  uint32_t i = 0U, j;                            /* Loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Index of the last computed output */
+
+  /* Outputs 0 .. tot are computed, i.e. srcALen + srcBLen - 1 values */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Skip the leading entries that correspond to zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Swap roles: B is indexed forwards ... */
+    pIn1 = pSrcB;
+
+    /* ... and A backwards from its last element */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Results are stored from the last computed position downwards */
+    pDst = pDst + tot;
+
+    /* Swap the lengths so that srcALen describes pIn1 and srcBLen pIn2 */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* One pass per computed output */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    for (j = 0U; j <= i; j++)
+    {
+      /* Only pairings with both indices inside their sequences contribute */
+      if (((i - j) < srcBLen) && (j < srcALen))
+      {
+        /* The backwards offset is negated as a signed value.  Negating the
+         * unsigned difference instead would give a huge positive index that
+         * only lands on the right element where pointers are 32 bits wide. */
+        sum = (q31_t) ((((q63_t) sum << 32) +
+                        ((q63_t) pIn1[j] * pIn2[-(int32_t)(i - j)])) >> 32);
+      }
+    }
+
+    /* Store the output, walking pDst in the direction chosen above */
+    if (inv == 1)
+    {
+      *pDst-- = (q31_t)(sum << 1U);
+    }
+    else
+    {
+      *pDst++ = (q31_t)(sum << 1U);
+    }
+  }
+}
+
+/*
+ * Reference Q15 correlation of pSrcA with pSrcB.
+ *
+ * The shorter input is slid across the longer one.  When srcBLen > srcALen
+ * the inputs are swapped and the results are stored in reverse order, since
+ * CORR(x, y) is the reverse of CORR(y, x).
+ *
+ * srcALen + srcBLen - 1 values are written into pDst:
+ *   - srcALen >= srcBLen: ascending, starting at pDst[srcALen - srcBLen];
+ *   - srcALen <  srcBLen: descending, from pDst[srcALen + srcBLen - 2] to
+ *     pDst[0].
+ * Other elements of pDst are left untouched.
+ *
+ * Products are accumulated in a q63_t sum, which is shifted right by 15 and
+ * passed through ref_sat_q15() before being stored.
+ *
+ * pSrcA, srcALen  first input sequence and its length
+ * pSrcB, srcBLen  second input sequence and its length
+ * pDst            output buffer
+ */
+void ref_correlate_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  q15_t *pIn1 = pSrcA;                           /* Sequence indexed forwards */
+  q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* Last element of the sequence indexed backwards */
+  q63_t sum;                                     /* Accumulator */
+  uint32_t i = 0U, j;                            /* Loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Index of the last computed output */
+
+  /* Outputs 0 .. tot are computed, i.e. srcALen + srcBLen - 1 values */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Skip the leading entries that correspond to zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Swap roles: B is indexed forwards ... */
+    pIn1 = pSrcB;
+
+    /* ... and A backwards from its last element */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Results are stored from the last computed position downwards */
+    pDst = pDst + tot;
+
+    /* Swap the lengths so that srcALen describes pIn1 and srcBLen pIn2 */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* One pass per computed output */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    for (j = 0U; j <= i; j++)
+    {
+      /* Only pairings with both indices inside their sequences contribute */
+      if (((i - j) < srcBLen) && (j < srcALen))
+      {
+        /* The backwards offset is negated as a signed value.  Negating the
+         * unsigned difference instead would give a huge positive index that
+         * only lands on the right element where pointers are 32 bits wide. */
+        sum += ((q31_t) pIn1[j] * pIn2[-(int32_t)(i - j)]);
+      }
+    }
+
+    /* Store the output, walking pDst in the direction chosen above */
+    if (inv == 1)
+    {
+      *pDst-- = (q15_t) ref_sat_q15(sum >> 15U);
+    }
+    else
+    {
+      *pDst++ = (q15_t) ref_sat_q15(sum >> 15U);
+    }
+  }
+}
+
+/*
+ * Reference model of the fast Q15 correlation of pSrcA with pSrcB.
+ *
+ * The shorter input is slid across the longer one.  When srcBLen > srcALen
+ * the inputs are swapped and the results are stored in reverse order, since
+ * CORR(x, y) is the reverse of CORR(y, x).
+ *
+ * srcALen + srcBLen - 1 values are written into pDst:
+ *   - srcALen >= srcBLen: ascending, starting at pDst[srcALen - srcBLen];
+ *   - srcALen <  srcBLen: descending, from pDst[srcALen + srcBLen - 2] to
+ *     pDst[0].
+ * Other elements of pDst are left untouched.
+ *
+ * Products are accumulated in a q63_t sum, which is shifted right by 15 and
+ * cast straight to q15_t; unlike ref_correlate_q15() no saturation helper is
+ * called.
+ *
+ * pSrcA, srcALen  first input sequence and its length
+ * pSrcB, srcBLen  second input sequence and its length
+ * pDst            output buffer
+ */
+void ref_correlate_fast_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst)
+{
+  q15_t *pIn1 = pSrcA;                           /* Sequence indexed forwards */
+  q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* Last element of the sequence indexed backwards */
+  q63_t sum;                                     /* Accumulator */
+  uint32_t i = 0U, j;                            /* Loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Index of the last computed output */
+
+  /* Outputs 0 .. tot are computed, i.e. srcALen + srcBLen - 1 values */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Skip the leading entries that correspond to zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Swap roles: B is indexed forwards ... */
+    pIn1 = pSrcB;
+
+    /* ... and A backwards from its last element */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Results are stored from the last computed position downwards */
+    pDst = pDst + tot;
+
+    /* Swap the lengths so that srcALen describes pIn1 and srcBLen pIn2 */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* One pass per computed output */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    for (j = 0U; j <= i; j++)
+    {
+      /* Only pairings with both indices inside their sequences contribute */
+      if (((i - j) < srcBLen) && (j < srcALen))
+      {
+        /* The backwards offset is negated as a signed value.  Negating the
+         * unsigned difference instead would give a huge positive index that
+         * only lands on the right element where pointers are 32 bits wide. */
+        sum += ((q31_t) pIn1[j] * pIn2[-(int32_t)(i - j)]);
+      }
+    }
+
+    /* Store the output, walking pDst in the direction chosen above */
+    if (inv == 1)
+    {
+      *pDst-- = (q15_t)(sum >> 15U);
+    }
+    else
+    {
+      *pDst++ = (q15_t)(sum >> 15U);
+    }
+  }
+}
+
+/*
+ * Reference model of the fast, scratch-buffer Q15 correlation of pSrcA with
+ * pSrcB.
+ *
+ * The shorter input is slid across the longer one.  When srcBLen > srcALen
+ * the inputs are swapped and the results are stored in reverse order, since
+ * CORR(x, y) is the reverse of CORR(y, x).
+ *
+ * srcALen + srcBLen - 1 values are written into pDst:
+ *   - srcALen >= srcBLen: ascending, starting at pDst[srcALen - srcBLen];
+ *   - srcALen <  srcBLen: descending, from pDst[srcALen + srcBLen - 2] to
+ *     pDst[0].
+ * Other elements of pDst are left untouched.
+ *
+ * Products are accumulated in a q31_t sum, which is shifted right by 15 and
+ * passed through ref_sat_q15() before being stored.
+ *
+ * pSrcA, srcALen  first input sequence and its length
+ * pSrcB, srcBLen  second input sequence and its length
+ * pDst            output buffer
+ * pScratch        accepted but not used by this model
+ */
+void ref_correlate_fast_opt_q15(
+  q15_t * pSrcA,
+  uint32_t srcALen,
+  q15_t * pSrcB,
+  uint32_t srcBLen,
+  q15_t * pDst,
+  q15_t * pScratch)
+{
+  q15_t *pIn1 = pSrcA;                           /* Sequence indexed forwards */
+  q15_t *pIn2 = pSrcB + (srcBLen - 1U);          /* Last element of the sequence indexed backwards */
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i = 0U, j;                            /* Loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Index of the last computed output */
+
+  /* The scratch buffer is never read or written here */
+  (void) pScratch;
+
+  /* Outputs 0 .. tot are computed, i.e. srcALen + srcBLen - 1 values */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Skip the leading entries that correspond to zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Swap roles: B is indexed forwards ... */
+    pIn1 = pSrcB;
+
+    /* ... and A backwards from its last element */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Results are stored from the last computed position downwards */
+    pDst = pDst + tot;
+
+    /* Swap the lengths so that srcALen describes pIn1 and srcBLen pIn2 */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* One pass per computed output */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    for (j = 0U; j <= i; j++)
+    {
+      /* Only pairings with both indices inside their sequences contribute */
+      if (((i - j) < srcBLen) && (j < srcALen))
+      {
+        /* The backwards offset is negated as a signed value.  Negating the
+         * unsigned difference instead would give a huge positive index that
+         * only lands on the right element where pointers are 32 bits wide. */
+        sum += ((q31_t) pIn1[j] * pIn2[-(int32_t)(i - j)]);
+      }
+    }
+
+    /* Store the output, walking pDst in the direction chosen above */
+    if (inv == 1)
+    {
+      *pDst-- = (q15_t) ref_sat_q15(sum >> 15U);
+    }
+    else
+    {
+      *pDst++ = (q15_t) ref_sat_q15(sum >> 15U);
+    }
+  }
+}
+
+/*
+ * Reference Q7 correlation of pSrcA with pSrcB.
+ *
+ * The shorter input is slid across the longer one.  When srcBLen > srcALen
+ * the inputs are swapped and the results are stored in reverse order, since
+ * CORR(x, y) is the reverse of CORR(y, x).
+ *
+ * srcALen + srcBLen - 1 values are written into pDst:
+ *   - srcALen >= srcBLen: ascending, starting at pDst[srcALen - srcBLen];
+ *   - srcALen <  srcBLen: descending, from pDst[srcALen + srcBLen - 2] to
+ *     pDst[0].
+ * Other elements of pDst are left untouched.
+ *
+ * Products are accumulated in a q31_t sum, which is shifted right by 7 and
+ * passed through __SSAT(..., 8U) before being stored as q7_t.
+ *
+ * pSrcA, srcALen  first input sequence and its length
+ * pSrcB, srcBLen  second input sequence and its length
+ * pDst            output buffer
+ */
+void ref_correlate_q7(
+  q7_t * pSrcA,
+  uint32_t srcALen,
+  q7_t * pSrcB,
+  uint32_t srcBLen,
+  q7_t * pDst)
+{
+  q7_t *pIn1 = pSrcA;                            /* Sequence indexed forwards */
+  q7_t *pIn2 = pSrcB + (srcBLen - 1U);           /* Last element of the sequence indexed backwards */
+  q31_t sum;                                     /* Accumulator */
+  uint32_t i = 0U, j;                            /* Loop counters */
+  uint32_t inv = 0U;                             /* Reverse order flag */
+  uint32_t tot = 0U;                             /* Index of the last computed output */
+
+  /* Outputs 0 .. tot are computed, i.e. srcALen + srcBLen - 1 values */
+  tot = ((srcALen + srcBLen) - 2U);
+
+  if (srcALen > srcBLen)
+  {
+    /* Skip the leading entries that correspond to zero padding */
+    pDst += srcALen - srcBLen;
+  }
+  else if (srcALen < srcBLen)
+  {
+    /* Swap roles: B is indexed forwards ... */
+    pIn1 = pSrcB;
+
+    /* ... and A backwards from its last element */
+    pIn2 = pSrcA + (srcALen - 1U);
+
+    /* Results are stored from the last computed position downwards */
+    pDst = pDst + tot;
+
+    /* Swap the lengths so that srcALen describes pIn1 and srcBLen pIn2 */
+    j = srcALen;
+    srcALen = srcBLen;
+    srcBLen = j;
+
+    /* Setting the reverse flag */
+    inv = 1;
+  }
+
+  /* One pass per computed output */
+  for (i = 0U; i <= tot; i++)
+  {
+    /* Initialize sum with zero to carry on MAC operations */
+    sum = 0;
+
+    for (j = 0U; j <= i; j++)
+    {
+      /* Only pairings with both indices inside their sequences contribute */
+      if (((i - j) < srcBLen) && (j < srcALen))
+      {
+        /* The backwards offset is negated as a signed value.  Negating the
+         * unsigned difference instead would give a huge positive index that
+         * only lands on the right element where pointers are 32 bits wide. */
+        sum += ((q15_t) pIn1[j] * pIn2[-(int32_t)(i - j)]);
+      }
+    }
+
+    /* Store the output, walking pDst in the direction chosen above */
+    if (inv == 1)
+    {
+      *pDst-- = (q7_t) __SSAT((sum >> 7U), 8U);
+    }
+    else
+    {
+      *pDst++ = (q7_t) __SSAT((sum >> 7U), 8U);
+    }
+  }
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir.c
new file mode 100644
index 0000000..9867f4c
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir.c
@@ -0,0 +1,325 @@
+#include "ref.h"
+
+/*
+ * Reference floating-point FIR filter.
+ *
+ * The state buffer S->pState starts with the numTaps - 1 samples kept from
+ * the previous call; each new input sample is appended after them.  Output n
+ * is the sum over k of state[n + k] * S->pCoeffs[k] for k = 0 .. numTaps - 1,
+ * so pCoeffs[0] multiplies the oldest sample of the window.  Before
+ * returning, the newest numTaps - 1 samples are moved to the front of the
+ * state buffer for the next call.
+ *
+ * S          filter instance (numTaps, pCoeffs, pState are read; the buffer
+ *            behind pState is updated, indices 0 .. numTaps + blockSize - 2)
+ * pSrc       blockSize input samples
+ * pDst       blockSize output samples
+ * blockSize  number of samples to process
+ */
+void ref_fir_f32(
+	const arm_fir_instance_f32 * S,
+	      float32_t * pSrc,
+	      float32_t * pDst,
+	      uint32_t blockSize)
+{
+  const uint32_t numTaps = S->numTaps;           /* Filter length */
+  const float32_t *coeffs = S->pCoeffs;          /* Filter coefficients */
+  float32_t *window = S->pState;                 /* Oldest sample of the current window */
+  float32_t *writePos = S->pState + (numTaps - 1U);  /* Slot for the next input sample */
+  float32_t *front;                              /* Start of the state buffer */
+  float32_t acc;                                 /* Accumulator */
+  uint32_t n, k;                                 /* Sample and tap counters */
+
+  for (n = 0U; n < blockSize; n++)
+  {
+    /* Append the new sample behind the retained history */
+    *writePos = pSrc[n];
+    writePos++;
+
+    /* Dot product of the current window with the coefficients */
+    acc = 0.0f;
+    for (k = 0U; k < numTaps; k++)
+    {
+      acc += window[k] * coeffs[k];
+    }
+    pDst[n] = acc;
+
+    /* Slide the window forward by one sample */
+    window++;
+  }
+
+  /* Keep the newest numTaps - 1 samples at the front for the next call */
+  front = S->pState;
+  for (k = 0U; k < numTaps - 1U; k++)
+  {
+    front[k] = window[k];
+  }
+}
+
+/*
+ * Reference Q31 FIR filter.
+ *
+ * The state buffer S->pState starts with the numTaps - 1 samples kept from
+ * the previous call; each new input sample is appended after them.  For
+ * every input sample the numTaps products state[k] * pCoeffs[k] are summed
+ * in a q63_t accumulator, which is shifted right by 31 and cast to q31_t
+ * (no saturation).  Before returning, the newest numTaps - 1 samples are
+ * moved to the front of the state buffer for the next call.
+ *
+ * S          filter instance (numTaps, pCoeffs, pState are read; the buffer
+ *            behind pState is updated, indices 0 .. numTaps + blockSize - 2)
+ * pSrc       blockSize input samples
+ * pDst       blockSize output samples
+ * blockSize  number of samples to process
+ */
+void ref_fir_q31(
+  const arm_fir_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                 /* Oldest sample of the current window */
+  const q31_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q31_t *pStateCurnt;                        /* Slot for the next input sample */
+  uint32_t numTaps = S->numTaps;             /* Number of filter coefficients */
+  uint32_t i;                                /* Loop counter */
+  q63_t acc;                                 /* Accumulator */
+
+  /* New input is written after the numTaps - 1 retained samples */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  while (blockSize > 0U)
+  {
+    /* Copy one sample at a time into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Integer accumulator: reset with an integer zero */
+    acc = 0;
+
+    for (i = 0; i < numTaps; i++)
+    {
+      /* Perform the multiply-accumulates */
+      acc += (q63_t)pState[i] * pCoeffs[i];
+    }
+
+    /* Scale the sum back and store it in the destination buffer */
+    *pDst++ = (q31_t)(acc >> 31);
+
+    /* Advance the window by one sample */
+    pState++;
+
+    blockSize--;
+  }
+
+  /* Processing is complete.
+  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+  ** This prepares the state buffer for the next function call. */
+  pStateCurnt = S->pState;
+
+  for (i = 0; i < numTaps - 1; i++)
+  {
+    pStateCurnt[i] = pState[i];
+  }
+}
+
+/*
+ * Reference model of the fast Q31 FIR filter.
+ *
+ * The state buffer S->pState starts with the numTaps - 1 samples kept from
+ * the previous call; each new input sample is appended after them.  The
+ * accumulator is a q31_t: after every tap only the upper 32 bits of
+ * (acc << 32) + state[k] * pCoeffs[k] + 0x80000000 are kept, the added
+ * constant rounding the discarded low half.  The final value is shifted left
+ * by one and stored without saturation.  Before returning, the newest
+ * numTaps - 1 samples are moved to the front of the state buffer.
+ *
+ * S          filter instance (numTaps, pCoeffs, pState are read; the buffer
+ *            behind pState is updated, indices 0 .. numTaps + blockSize - 2)
+ * pSrc       blockSize input samples
+ * pDst       blockSize output samples
+ * blockSize  number of samples to process
+ */
+void ref_fir_fast_q31(
+  const arm_fir_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                 /* Oldest sample of the current window */
+  const q31_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q31_t *pStateCurnt;                        /* Slot for the next input sample */
+  uint32_t numTaps = S->numTaps;             /* Number of filter coefficients */
+  uint32_t i;                                /* Loop counter */
+  q31_t acc;                                 /* Accumulator (upper 32 bits only) */
+
+  /* New input is written after the numTaps - 1 retained samples */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  while (blockSize > 0U)
+  {
+    /* Copy one sample at a time into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Integer accumulator: reset with an integer zero */
+    acc = 0;
+
+    for (i = 0; i < numTaps; i++)
+    {
+      /* Multiply-accumulate, keeping only the rounded upper 32 bits */
+      acc = (q31_t) (((((q63_t) acc) << 32) + ((q63_t) pState[i] * pCoeffs[i]) + 0x80000000LL ) >> 32);
+    }
+
+    /* Scale the sum back and store it in the destination buffer */
+    *pDst++ = (q31_t)(acc << 1);
+
+    /* Advance the window by one sample */
+    pState++;
+
+    blockSize--;
+  }
+
+  /* Processing is complete.
+  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+  ** This prepares the state buffer for the next function call. */
+  pStateCurnt = S->pState;
+
+  for (i = 0; i < numTaps - 1; i++)
+  {
+    pStateCurnt[i] = pState[i];
+  }
+}
+
+/*
+ * Reference Q15 FIR filter.
+ *
+ * The state buffer S->pState starts with the numTaps - 1 samples kept from
+ * the previous call; each new input sample is appended after them.  For
+ * every input sample the numTaps products state[k] * pCoeffs[k] are summed
+ * in a q63_t accumulator, which is shifted right by 15 and passed through
+ * ref_sat_q15() before being stored.  Before returning, the newest
+ * numTaps - 1 samples are moved to the front of the state buffer for the
+ * next call.
+ *
+ * S          filter instance (numTaps, pCoeffs, pState are read; the buffer
+ *            behind pState is updated, indices 0 .. numTaps + blockSize - 2)
+ * pSrc       blockSize input samples
+ * pDst       blockSize output samples
+ * blockSize  number of samples to process
+ */
+void ref_fir_q15(
+  const arm_fir_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                 /* Oldest sample of the current window */
+  const q15_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q15_t *pStateCurnt;                        /* Slot for the next input sample */
+  uint32_t numTaps = S->numTaps;             /* Number of filter coefficients */
+  uint32_t i;                                /* Loop counter */
+  q63_t acc;                                 /* Accumulator */
+
+  /* New input is written after the numTaps - 1 retained samples */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  while (blockSize > 0U)
+  {
+    /* Copy one sample at a time into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Integer accumulator: reset with an integer zero */
+    acc = 0;
+
+    for (i = 0; i < numTaps; i++)
+    {
+      /* Perform the multiply-accumulates */
+      acc += (q31_t)pState[i] * pCoeffs[i];
+    }
+
+    /* Scale the sum back and store it in the destination buffer */
+    *pDst++ = ref_sat_q15(acc >> 15);
+
+    /* Advance the window by one sample */
+    pState++;
+
+    blockSize--;
+  }
+
+  /* Processing is complete.
+  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+  ** This prepares the state buffer for the next function call. */
+  pStateCurnt = S->pState;
+
+  /* Exactly numTaps - 1 samples are carried over; copying numTaps would read
+   * one element beyond the last sample written into the state buffer. */
+  for (i = 0; i < numTaps - 1; i++)
+  {
+    pStateCurnt[i] = pState[i];
+  }
+}
+
+/*
+ * Reference model of the fast Q15 FIR filter.
+ *
+ * The state buffer S->pState starts with the numTaps - 1 samples kept from
+ * the previous call; each new input sample is appended after them.  For
+ * every input sample the numTaps products state[k] * pCoeffs[k] are summed
+ * in a q31_t accumulator (narrower than the q63_t used by ref_fir_q15()),
+ * which is shifted right by 15 and passed through ref_sat_q15() before being
+ * stored.  Before returning, the newest numTaps - 1 samples are moved to the
+ * front of the state buffer for the next call.
+ *
+ * S          filter instance (numTaps, pCoeffs, pState are read; the buffer
+ *            behind pState is updated, indices 0 .. numTaps + blockSize - 2)
+ * pSrc       blockSize input samples
+ * pDst       blockSize output samples
+ * blockSize  number of samples to process
+ */
+void ref_fir_fast_q15(
+  const arm_fir_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                 /* Oldest sample of the current window */
+  const q15_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q15_t *pStateCurnt;                        /* Slot for the next input sample */
+  uint32_t numTaps = S->numTaps;             /* Number of filter coefficients */
+  uint32_t i;                                /* Loop counter */
+  q31_t acc;                                 /* Accumulator */
+
+  /* New input is written after the numTaps - 1 retained samples */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  while (blockSize > 0U)
+  {
+    /* Copy one sample at a time into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Integer accumulator: reset with an integer zero */
+    acc = 0;
+
+    for (i = 0; i < numTaps; i++)
+    {
+      /* Perform the multiply-accumulates */
+      acc += (q31_t)pState[i] * pCoeffs[i];
+    }
+
+    /* Scale the sum back and store it in the destination buffer */
+    *pDst++ = ref_sat_q15(acc >> 15);
+
+    /* Advance the window by one sample */
+    pState++;
+
+    blockSize--;
+  }
+
+  /* Processing is complete.
+  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+  ** This prepares the state buffer for the next function call. */
+  pStateCurnt = S->pState;
+
+  for (i = 0; i < numTaps - 1; i++)
+  {
+    pStateCurnt[i] = pState[i];
+  }
+}
+
+/*
+ * Reference Q7 FIR filter.
+ *
+ * The state buffer S->pState starts with the numTaps - 1 samples kept from
+ * the previous call; each new input sample is appended after them.  For
+ * every input sample the numTaps products state[k] * pCoeffs[k] are summed
+ * in a q31_t accumulator, which is shifted right by 7 and passed through
+ * ref_sat_q7() before being stored.  Before returning, the newest
+ * numTaps - 1 samples are moved to the front of the state buffer for the
+ * next call.
+ *
+ * S          filter instance (numTaps, pCoeffs, pState are read; the buffer
+ *            behind pState is updated, indices 0 .. numTaps + blockSize - 2)
+ * pSrc       blockSize input samples
+ * pDst       blockSize output samples
+ * blockSize  number of samples to process
+ */
+void ref_fir_q7(
+  const arm_fir_instance_q7 * S,
+        q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+  q7_t *pState = S->pState;                 /* Oldest sample of the current window */
+  const q7_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q7_t *pStateCurnt;                        /* Slot for the next input sample */
+  uint32_t numTaps = S->numTaps;            /* Number of filter coefficients */
+  uint32_t i;                               /* Loop counter */
+  q31_t acc;                                /* Accumulator */
+
+  /* New input is written after the numTaps - 1 retained samples */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  while (blockSize > 0U)
+  {
+    /* Copy one sample at a time into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Integer accumulator: reset with an integer zero */
+    acc = 0;
+
+    for (i = 0; i < numTaps; i++)
+    {
+      /* Perform the multiply-accumulates */
+      acc += (q31_t)pState[i] * pCoeffs[i];
+    }
+
+    /* Scale the sum back and store it in the destination buffer */
+    *pDst++ = ref_sat_q7(acc >> 7);
+
+    /* Advance the window by one sample */
+    pState++;
+
+    blockSize--;
+  }
+
+  /* Processing is complete.
+  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+  ** This prepares the state buffer for the next function call. */
+  pStateCurnt = S->pState;
+
+  for (i = 0; i < numTaps - 1; i++)
+  {
+    pStateCurnt[i] = pState[i];
+  }
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_decimate.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_decimate.c
new file mode 100644
index 0000000..a743e73
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_decimate.c
@@ -0,0 +1,386 @@
+#include "ref.h"
+
+/*
+ * Reference FIR decimator for float32 data.
+ *
+ * S->M input samples are consumed per output sample, and blockSize / S->M
+ * results are written to pDst.  Each result is the sum of S->numTaps
+ * products of consecutive state samples with the coefficients.
+ * Before returning, numTaps - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_decimate_f32(
+  const arm_fir_decimate_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  const float32_t *coeffs = S->pCoeffs;     /* Filter coefficients */
+  float32_t *window = S->pState;            /* First sample of the current filter window */
+  float32_t *fill;                          /* Next slot to write in the state buffer */
+  float32_t acc;                            /* Running sum for one output sample */
+  uint32_t taps = S->numTaps;               /* Number of coefficients */
+  uint32_t remaining, k;                    /* Loop counters */
+
+  /* New samples go after the numTaps - 1 samples kept from the previous frame */
+  fill = S->pState + taps - 1U;
+
+  for (remaining = blockSize / S->M; remaining > 0U; remaining--)
+  {
+    /* Bring in one decimation step (S->M samples) of new input */
+    k = S->M;
+    do
+    {
+      *fill++ = *pSrc++;
+    } while (--k);
+
+    /* Sum state[k] * coeff[k] in ascending tap order */
+    acc = 0.0f;
+    for (k = 0; k < taps; k++)
+    {
+      acc += window[k] * coeffs[k];
+    }
+
+    *pDst++ = acc;
+
+    /* Slide the window forward by the decimation factor */
+    window += S->M;
+  }
+
+  /* Move the numTaps - 1 samples at the final window position to the
+   * front of the state buffer, ready for the next call */
+  fill = S->pState;
+  for (k = taps - 1U; k > 0U; k--)
+  {
+    *fill++ = *window++;
+  }
+}
+
+/*
+ * Reference FIR decimator for Q31 data.
+ *
+ * S->M input samples are consumed per output sample, and blockSize / S->M
+ * results are written to pDst.  The S->numTaps products are summed in a
+ * 64-bit accumulator, which is shifted right by 31 and truncated to q31_t.
+ * Before returning, numTaps - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_decimate_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  const q31_t *coeffs = S->pCoeffs;         /* Filter coefficients */
+  q31_t *window = S->pState;                /* First sample of the current filter window */
+  q31_t *fill;                              /* Next slot to write in the state buffer */
+  q63_t acc;                                /* 64-bit running sum for one output sample */
+  uint32_t taps = S->numTaps;               /* Number of coefficients */
+  uint32_t remaining, k;                    /* Loop counters */
+
+  /* New samples go after the numTaps - 1 samples kept from the previous frame */
+  fill = S->pState + taps - 1U;
+
+  for (remaining = blockSize / S->M; remaining > 0U; remaining--)
+  {
+    /* Bring in one decimation step (S->M samples) of new input */
+    k = S->M;
+    do
+    {
+      *fill++ = *pSrc++;
+    } while (--k);
+
+    /* Sum the 64-bit products state[k] * coeff[k] in ascending tap order */
+    acc = 0;
+    for (k = 0; k < taps; k++)
+    {
+      acc += (q63_t) window[k] * coeffs[k];
+    }
+
+    /* Scale the sum back by 31 bits and store it */
+    *pDst++ = (q31_t) (acc >> 31);
+
+    /* Slide the window forward by the decimation factor */
+    window = window + S->M;
+  }
+
+  /* Move the numTaps - 1 samples at the final window position to the
+   * front of the state buffer, ready for the next call */
+  fill = S->pState;
+  for (k = taps - 1U; k > 0U; k--)
+  {
+    *fill++ = *window++;
+  }
+}
+
+/*
+ * Reference "fast" FIR decimator for Q31 data.
+ *
+ * S->M input samples are consumed per output sample, and blockSize / S->M
+ * results are written to pDst.  Unlike ref_fir_decimate_q31, the running sum
+ * is held in a 32-bit accumulator: after each multiply only the upper 32 bits
+ * of (accumulator * 2^32 + product) are kept, and the final value is doubled
+ * before it is stored.  No saturation is applied.
+ *
+ * The wrap-around arithmetic is done on unsigned types so that negative
+ * partial sums are never left-shifted as signed values and the 64-bit add
+ * cannot overflow a signed type (both are undefined behaviour in C).  On
+ * two's-complement targets the stored results are unchanged.
+ *
+ * Before returning, numTaps - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_decimate_fast_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+        q31_t *pState = S->pState;                     /* State pointer */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+        q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+        q31_t sum0;                                    /* 32-bit accumulator */
+        uint64_t wide;                                 /* 64-bit intermediate of one multiply-accumulate */
+        uint32_t numTaps = S->numTaps;                 /* Number of taps */
+        uint32_t i, blkCnt;                            /* Loop counters */
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = S->pState + numTaps - 1U;
+
+  /* Total number of output samples to be computed */
+  blkCnt = blockSize / S->M;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCurnt++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0;
+
+    for (i = 0; i < numTaps; i++)
+    {
+      /* Read coefficients */
+      c0 = pCoeffs[i];
+
+      /* Fetch 1 state variable */
+      x0 = pState[i];
+
+      /* Multiply-accumulate, keeping only bits 63..32 of the 64-bit sum.
+       * Unsigned arithmetic wraps modulo 2^64, so no signed shift or signed overflow occurs. */
+      wide = ((uint64_t) (uint32_t) sum0 << 32) + (uint64_t) ((q63_t) x0 * c0);
+      sum0 = (q31_t) (uint32_t) (wide >> 32);
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* Double the accumulator (shifted as unsigned to avoid a signed left shift)
+     * and store it in the destination buffer. */
+    *pDst++ = (q31_t) ((uint32_t) sum0 << 1);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  i = numTaps - 1U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    i--;
+  }
+}
+
+/*
+ * Reference FIR decimator for Q15 data.
+ *
+ * S->M input samples are consumed per output sample, and blockSize / S->M
+ * results are written to pDst.  The S->numTaps products are summed in a
+ * 64-bit accumulator; the sum is shifted right by 15 and passed through
+ * ref_sat_q15 before it is stored.
+ * Before returning, numTaps - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_decimate_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  const q15_t *coeffs = S->pCoeffs;         /* Filter coefficients */
+  q15_t *window = S->pState;                /* First sample of the current filter window */
+  q15_t *fill;                              /* Next slot to write in the state buffer */
+  q63_t acc;                                /* 64-bit running sum for one output sample */
+  uint32_t taps = S->numTaps;               /* Number of coefficients */
+  uint32_t remaining, k;                    /* Loop counters */
+
+  /* New samples go after the numTaps - 1 samples kept from the previous frame */
+  fill = S->pState + taps - 1U;
+
+  for (remaining = blockSize / S->M; remaining > 0U; remaining--)
+  {
+    /* Bring in one decimation step (S->M samples) of new input */
+    k = S->M;
+    do
+    {
+      *fill++ = *pSrc++;
+    } while (--k);
+
+    /* Sum the 32-bit products state[k] * coeff[k] in ascending tap order */
+    acc = 0;
+    for (k = 0; k < taps; k++)
+    {
+      acc += (q31_t) window[k] * coeffs[k];
+    }
+
+    /* Scale the sum back by 15 bits and store the ref_sat_q15 result */
+    *pDst++ = ref_sat_q15(acc >> 15);
+
+    /* Slide the window forward by the decimation factor */
+    window = window + S->M;
+  }
+
+  /* Move the numTaps - 1 samples at the final window position to the
+   * front of the state buffer, ready for the next call */
+  fill = S->pState;
+  for (k = taps - 1U; k > 0U; k--)
+  {
+    *fill++ = *window++;
+  }
+}
+
+/*
+ * Reference "fast" FIR decimator for Q15 data.
+ *
+ * S->M input samples are consumed per output sample, and blockSize / S->M
+ * results are written to pDst.  Unlike ref_fir_decimate_q15, the products are
+ * summed in a 32-bit (q31_t) accumulator with no protection against overflow
+ * during accumulation; the sum is shifted right by 15 and passed through
+ * ref_sat_q15 before it is stored.
+ * Before returning, numTaps - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_decimate_fast_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  const q15_t *coeffs = S->pCoeffs;         /* Filter coefficients */
+  q15_t *window = S->pState;                /* First sample of the current filter window */
+  q15_t *fill;                              /* Next slot to write in the state buffer */
+  q31_t acc;                                /* 32-bit running sum for one output sample */
+  uint32_t taps = S->numTaps;               /* Number of coefficients */
+  uint32_t remaining, k;                    /* Loop counters */
+
+  /* New samples go after the numTaps - 1 samples kept from the previous frame */
+  fill = S->pState + taps - 1U;
+
+  for (remaining = blockSize / S->M; remaining > 0U; remaining--)
+  {
+    /* Bring in one decimation step (S->M samples) of new input */
+    k = S->M;
+    do
+    {
+      *fill++ = *pSrc++;
+    } while (--k);
+
+    /* Sum state[k] * coeff[k] in ascending tap order */
+    acc = 0;
+    for (k = 0; k < taps; k++)
+    {
+      acc += window[k] * coeffs[k];
+    }
+
+    /* Scale the sum back by 15 bits and store the ref_sat_q15 result */
+    *pDst++ = ref_sat_q15(acc >> 15);
+
+    /* Slide the window forward by the decimation factor */
+    window = window + S->M;
+  }
+
+  /* Move the numTaps - 1 samples at the final window position to the
+   * front of the state buffer, ready for the next call */
+  fill = S->pState;
+  for (k = taps - 1U; k > 0U; k--)
+  {
+    *fill++ = *window++;
+  }
+}
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_interpolate.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_interpolate.c
new file mode 100644
index 0000000..4341621
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_interpolate.c
@@ -0,0 +1,291 @@
+#include "ref.h"
+
+/*
+ * Reference FIR interpolator for float32 data.
+ *
+ * For each of the blockSize input samples, S->L output samples are written
+ * to pDst.  Every output is a sum over S->phaseLength consecutive state
+ * samples; the matching coefficients are read from S->pCoeffs with a stride
+ * of S->L, starting at offset L - 1 for the first output of a group and
+ * ending at offset 0 for the last.
+ * Before returning, phaseLength - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_interpolate_f32(
+  const arm_fir_interpolate_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  const float32_t *coeffs = S->pCoeffs;     /* Start of the coefficient array */
+  const float32_t *h;                       /* Coefficient read position for one output */
+  float32_t *window = S->pState;            /* First sample of the current filter window */
+  float32_t *fill;                          /* Next slot to write in the state buffer */
+  float32_t *x;                             /* State read position for one output */
+  float32_t acc;                            /* Running sum for one output sample */
+  uint32_t phase, remaining;                /* Loop counters */
+  uint16_t phaseLen = S->phaseLength, t;    /* Taps per output, and tap counter */
+
+  /* New samples go after the phaseLen - 1 samples kept from the previous frame */
+  fill = S->pState + phaseLen - 1;
+
+  for (remaining = blockSize; remaining > 0U; remaining--)
+  {
+    /* Append one new input sample to the state buffer */
+    *fill++ = *pSrc++;
+
+    /* Produce S->L outputs for this input, counting the coefficient offset down */
+    for (phase = S->L; phase > 0U; phase--)
+    {
+      x = window;
+      h = coeffs + phase - 1;
+
+      acc = 0.0f;
+      for (t = phaseLen; t > 0U; t--)
+      {
+        acc += *x++ * *h;
+
+        /* Next coefficient of the same group lies S->L entries further on */
+        h += S->L;
+      }
+
+      *pDst++ = acc;
+    }
+
+    /* Slide the window forward by one input sample */
+    window = window + 1;
+  }
+
+  /* Move the phaseLen - 1 samples at the final window position to the
+   * front of the state buffer, ready for the next call */
+  fill = S->pState;
+  for (t = phaseLen - 1U; t > 0U; t--)
+  {
+    *fill++ = *window++;
+  }
+
+}
+
+/*
+ * Reference FIR interpolator for Q31 data.
+ *
+ * For each of the blockSize input samples, S->L output samples are written
+ * to pDst.  Every output is a 64-bit sum over S->phaseLength consecutive
+ * state samples, shifted right by 31 and truncated to q31_t; the matching
+ * coefficients are read from S->pCoeffs with a stride of S->L, starting at
+ * offset L - 1 for the first output of a group and ending at offset 0 for
+ * the last.
+ * Before returning, phaseLength - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_interpolate_q31(
+  const arm_fir_interpolate_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  const q31_t *coeffs = S->pCoeffs;         /* Start of the coefficient array */
+  const q31_t *h;                           /* Coefficient read position for one output */
+  q31_t *window = S->pState;                /* First sample of the current filter window */
+  q31_t *fill;                              /* Next slot to write in the state buffer */
+  q31_t *x;                                 /* State read position for one output */
+  q63_t acc;                                /* 64-bit running sum for one output sample */
+  uint32_t phase, remaining;                /* Loop counters */
+  uint16_t phaseLen = S->phaseLength, t;    /* Taps per output, and tap counter */
+
+  /* New samples go after the phaseLen - 1 samples kept from the previous frame */
+  fill = S->pState + phaseLen - 1;
+
+  for (remaining = blockSize; remaining > 0U; remaining--)
+  {
+    /* Append one new input sample to the state buffer */
+    *fill++ = *pSrc++;
+
+    /* Produce S->L outputs for this input, counting the coefficient offset down */
+    for (phase = S->L; phase > 0U; phase--)
+    {
+      x = window;
+      h = coeffs + phase - 1;
+
+      acc = 0;
+      for (t = phaseLen; t > 0U; t--)
+      {
+        /* 64-bit product of one state sample and one coefficient */
+        acc += (q63_t) *x++ * *h;
+
+        /* Next coefficient of the same group lies S->L entries further on */
+        h += S->L;
+      }
+
+      /* Scale the sum back by 31 bits and store it */
+      *pDst++ = (q31_t) (acc >> 31);
+    }
+
+    /* Slide the window forward by one input sample */
+    window = window + 1;
+  }
+
+  /* Move the phaseLen - 1 samples at the final window position to the
+   * front of the state buffer, ready for the next call */
+  fill = S->pState;
+  for (t = phaseLen - 1U; t > 0U; t--)
+  {
+    *fill++ = *window++;
+  }
+
+}
+
+/*
+ * Reference FIR interpolator for Q15 data.
+ *
+ * For each of the blockSize input samples, S->L output samples are written
+ * to pDst.  Every output is a 64-bit sum over S->phaseLength consecutive
+ * state samples, shifted right by 15 and passed through ref_sat_q15; the
+ * matching coefficients are read from S->pCoeffs with a stride of S->L,
+ * starting at offset L - 1 for the first output of a group and ending at
+ * offset 0 for the last.
+ * Before returning, phaseLength - 1 samples are moved from the final window
+ * position to the start of S->pState so the next call can continue.
+ */
+void ref_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  const q15_t *coeffs = S->pCoeffs;         /* Start of the coefficient array */
+  const q15_t *h;                           /* Coefficient read position for one output */
+  q15_t *window = S->pState;                /* First sample of the current filter window */
+  q15_t *fill;                              /* Next slot to write in the state buffer */
+  q15_t *x;                                 /* State read position for one output */
+  q63_t acc;                                /* 64-bit running sum for one output sample */
+  uint32_t phase, remaining, t;             /* Loop counters */
+  uint16_t phaseLen = S->phaseLength;       /* Taps per output */
+
+  /* New samples go after the phaseLen - 1 samples kept from the previous frame */
+  fill = S->pState + phaseLen - 1;
+
+  for (remaining = blockSize; remaining > 0U; remaining--)
+  {
+    /* Append one new input sample to the state buffer */
+    *fill++ = *pSrc++;
+
+    /* Produce S->L outputs for this input, counting the coefficient offset down */
+    for (phase = S->L; phase > 0U; phase--)
+    {
+      x = window;
+      h = coeffs + phase - 1;
+
+      acc = 0;
+      for (t = (uint32_t) phaseLen; t > 0U; t--)
+      {
+        /* 32-bit product of one state sample and one coefficient */
+        acc += (q31_t) *x++ * *h;
+
+        /* Next coefficient of the same group lies S->L entries further on */
+        h += S->L;
+      }
+
+      /* Scale the sum back by 15 bits and store the ref_sat_q15 result */
+      *pDst++ = ref_sat_q15(acc >> 15);
+    }
+
+    /* Slide the window forward by one input sample */
+    window = window + 1;
+  }
+
+  /* Move the phaseLen - 1 samples at the final window position to the
+   * front of the state buffer, ready for the next call */
+  fill = S->pState;
+  for (t = (uint32_t) phaseLen - 1U; t > 0U; t--)
+  {
+    *fill++ = *window++;
+  }
+
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_lattice.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_lattice.c
new file mode 100644
index 0000000..d04d560
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_lattice.c
@@ -0,0 +1,241 @@
+#include "ref.h"
+
+/*
+ * Reference FIR lattice filter for float32 data.
+ *
+ * Each of the blockSize input samples is run through S->numStages stages.
+ * Stage m reads the value left in S->pState[m] by the previous sample
+ * (gDelayed), uses coefficient S->pCoeffs[m] (k), and computes
+ *   fNext = f + k * gDelayed
+ *   gNext = f * k + gDelayed
+ * State slot 0 is overwritten with the input sample; slot m (m > 0) is
+ * overwritten with gNext of stage m - 1.  The f value leaving the last
+ * stage is written to pDst.
+ */
+void ref_fir_lattice_f32(
+  const arm_fir_lattice_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  const float32_t *coeffs = S->pCoeffs;     /* Start of the coefficient array */
+  const float32_t *k;                       /* Coefficient of the current stage */
+  float32_t *state = &S->pState[0];         /* Start of the state buffer */
+  float32_t *slot;                          /* State slot of the current stage */
+  float32_t fwd, fwdNext;                   /* Forward value entering / leaving a stage */
+  float32_t gDelayed;                       /* Value read from the current state slot */
+  float32_t gStore;                         /* Value to write into the current state slot */
+  uint32_t stages = S->numStages;           /* Number of lattice stages */
+  uint32_t remaining, stage;                /* Loop counters */
+
+  for (remaining = blockSize; remaining > 0U; remaining--)
+  {
+    /* f0(n) = x(n); the input is also what goes into state slot 0 */
+    fwd = *pSrc++;
+    gStore = fwd;
+
+    k = coeffs;
+    slot = state;
+
+    /* The body always runs for the first stage, then numStages - 1 more times */
+    stage = stages;
+    do
+    {
+      /* Swap the stored value of this stage with the new one */
+      gDelayed = *slot;
+      *slot++ = gStore;
+
+      /* fNext = f + k * gDelayed,  gNext = f * k + gDelayed */
+      fwdNext = fwd + ((*k) * gDelayed);
+      gStore = (fwd * (*k++)) + gDelayed;
+
+      fwd = fwdNext;
+    } while (--stage > 0U);
+
+    /* y(n) = fN(n) */
+    *pDst++ = fwd;
+  }
+}
+
+/*
+ * Reference FIR lattice filter for Q31 data.
+ *
+ * Each of the blockSize input samples is run through S->numStages stages.
+ * Stage m reads the value left in S->pState[m] by the previous sample
+ * (gDelayed), uses coefficient S->pCoeffs[m] (k), and computes
+ *   fNext = f + ((k * gDelayed) >> 31)
+ *   gNext = ((f * k) >> 31) + gDelayed
+ * with 64-bit products and plain 32-bit additions (no saturation).
+ * State slot 0 is overwritten with the input sample; slot m (m > 0) is
+ * overwritten with gNext of stage m - 1.  The f value leaving the last
+ * stage is written to pDst.
+ */
+void ref_fir_lattice_q31(
+  const arm_fir_lattice_instance_q31 * S,
+        q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  const q31_t *coeffs = S->pCoeffs;         /* Start of the coefficient array */
+  const q31_t *k;                           /* Coefficient of the current stage */
+  q31_t *state = &S->pState[0];             /* Start of the state buffer */
+  q31_t *slot;                              /* State slot of the current stage */
+  q31_t fwd, fwdNext;                       /* Forward value entering / leaving a stage */
+  q31_t gDelayed;                           /* Value read from the current state slot */
+  q31_t gStore;                             /* Value to write into the current state slot */
+  uint32_t stages = S->numStages;           /* Number of lattice stages */
+  uint32_t remaining, stage;                /* Loop counters */
+
+  for (remaining = blockSize; remaining > 0U; remaining--)
+  {
+    /* f0(n) = x(n); the input is also what goes into state slot 0 */
+    fwd = *pSrc++;
+    gStore = fwd;
+
+    k = coeffs;
+    slot = state;
+
+    /* The body always runs for the first stage, then numStages - 1 more times */
+    stage = stages;
+    do
+    {
+      /* Swap the stored value of this stage with the new one */
+      gDelayed = *slot;
+      *slot++ = gStore;
+
+      /* fNext = f + ((k * gDelayed) >> 31),  gNext = ((f * k) >> 31) + gDelayed */
+      fwdNext = (q31_t) (((q63_t) gDelayed * (*k)) >> 31) + fwd;
+      gStore = (q31_t) (((q63_t) fwd * (*k++)) >> 31) + gDelayed;
+
+      fwd = fwdNext;
+    } while (--stage > 0U);
+
+    /* y(n) = fN(n) */
+    *pDst++ = fwd;
+  }
+}
+
+/*
+ * Reference FIR lattice filter for Q15 data.
+ *
+ * Each of the blockSize input samples is run through S->numStages stages.
+ * Stage m reads the value left in S->pState[m] by the previous sample
+ * (gDelayed), uses coefficient S->pCoeffs[m] (k), and computes
+ *   fNext = ref_sat_q15(((gDelayed * k) >> 15) + f)
+ *   gNext = ref_sat_q15(((f * k) >> 15) + gDelayed)
+ * using q31_t temporaries.  State slot 0 is overwritten with the input
+ * sample; slot m (m > 0) is overwritten with gNext of stage m - 1.  The f
+ * value leaving the last stage is passed through ref_sat_q15 and written
+ * to pDst.
+ */
+void ref_fir_lattice_q15(
+  const arm_fir_lattice_instance_q15 * S,
+        q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  const q15_t *coeffs = S->pCoeffs;         /* Start of the coefficient array */
+  const q15_t *k;                           /* Coefficient of the current stage */
+  q15_t *state = &S->pState[0];             /* Start of the state buffer */
+  q15_t *slot;                              /* State slot of the current stage */
+  q31_t fwd, fwdNext;                       /* Forward value entering / leaving a stage */
+  q31_t gDelayed;                           /* Value read from the current state slot */
+  q31_t gStore;                             /* Value to write into the current state slot */
+  uint32_t stages = S->numStages;           /* Number of lattice stages */
+  uint32_t remaining, stage;                /* Loop counters */
+
+  for (remaining = blockSize; remaining > 0U; remaining--)
+  {
+    /* f0(n) = x(n); the input is also what goes into state slot 0 */
+    fwd = *pSrc++;
+    gStore = fwd;
+
+    k = (coeffs);
+    slot = state;
+
+    /* The body always runs for the first stage, then numStages - 1 more times */
+    stage = stages;
+    do
+    {
+      /* Swap the stored value of this stage with the new one */
+      gDelayed = *slot;
+      *slot++ = (q15_t) gStore;
+
+      /* fNext = f + ((k * gDelayed) >> 15), then ref_sat_q15 */
+      fwdNext = ((gDelayed * (*k)) >> 15U) + fwd;
+      fwdNext = ref_sat_q15(fwdNext);
+
+      /* gNext = ((f * k) >> 15) + gDelayed, then ref_sat_q15 */
+      gStore = ((fwd * (*k++)) >> 15U) + gDelayed;
+      gStore = ref_sat_q15(gStore);
+
+      fwd = fwdNext;
+    } while (--stage > 0U);
+
+    /* y(n) = fN(n) */
+    *pDst++ = ref_sat_q15(fwd);
+  }
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_sparse.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_sparse.c
new file mode 100644
index 0000000..73f074c
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/fir_sparse.c
@@ -0,0 +1,485 @@
+#include "ref.h"
+
+void ref_fir_sparse_f32(                               /* Reference sparse FIR filter for float32 data */
+        arm_fir_sparse_instance_f32 * S,               /* Filter instance: pState, pCoeffs, pTapDelay, maxDelay, numTaps, stateIndex are used */
+        float32_t * pSrc,                              /* Input block, blockSize samples */
+        float32_t * pDst,                              /* Output block, blockSize samples (overwritten, then accumulated into) */
+        float32_t * pScratchIn,                        /* Scratch buffer that receives the delayed state samples of one tap */
+        uint32_t blockSize)                            /* Number of samples processed by this call */
+{
+        float32_t *pState = S->pState;                 /* State (delay line) pointer */
+  const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer, advanced once per tap */
+        float32_t *px;                                 /* Read pointer into the scratch buffer */
+        float32_t *py = pState;                        /* Working pointer for the state buffer */
+        float32_t *pb = pScratchIn;                    /* Base of the scratch buffer */
+        float32_t *pOut;                               /* Write pointer into the destination buffer */
+        int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing the delay of each non-zero tap, advanced once per tap */
+        uint32_t delaySize = S->maxDelay + blockSize;  /* Length of the circular state buffer */
+        uint16_t numTaps = S->numTaps;                 /* Number of (coefficient, delay) pairs processed */
+        int32_t readIndex;                             /* Read index into the state buffer */
+        uint32_t tapCnt, blkCnt;                       /* Loop counters */
+        float32_t coeff = *pCoeffs++;                  /* First coefficient; read unconditionally, even if numTaps is 0 */
+
+  /* Overview: store the input block in the circular state buffer, then for every tap read the block delayed by that tap's offset and add coeff * sample into pDst. */
+  /* blockSize input samples are copied into the circular state buffer. */
+  /* &S->stateIndex gives the write position; the float buffers are handed over through int32_t * casts. */
+  arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
+                        (int32_t *) pSrc, 1, blockSize);
+
+
+  /* First tap: start reading blockSize plus the tap delay behind the current state index. */
+  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+  /* A negative index wraps around to the end of the circular buffer */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is reset to the buffer base */
+  py = pState;
+
+  /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                       (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                       blockSize);
+
+  /* Working pointer for the scratch buffer */
+  px = pb;
+
+  /* Working pointer for destination buffer */
+  pOut = pDst;
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* The first tap initialises the output: plain multiply, no accumulate */
+    *pOut++ = *px++ * coeff;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Remaining taps. numTaps == 0 would wrap this unsigned counter to a huge value. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient of this tap and
+     * advance the coefficient pointer for the next tap */
+    coeff = *pCoeffs++;
+
+    /* Same read-index computation as for the first tap, using this tap's delay. */
+    readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+    /* A negative index wraps around to the end of the circular buffer */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is reset to the buffer base */
+    py = pState;
+
+    /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+    arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                         (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                         blockSize);
+
+    /* Working pointer for the scratch buffer */
+    px = pb;
+
+    /* Working pointer for destination buffer */
+    pOut = pDst;
+
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Multiply-accumulate this tap's contribution into the output */
+      *pOut++ += *px++ * coeff;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_fir_sparse_q31(                               /* Reference sparse FIR filter for q31 data */
+        arm_fir_sparse_instance_q31 * S,               /* Filter instance: pState, pCoeffs, pTapDelay, maxDelay, numTaps, stateIndex are used */
+        q31_t * pSrc,                                  /* Input block, blockSize samples */
+        q31_t * pDst,                                  /* Output block, blockSize samples; also used as the accumulator */
+        q31_t * pScratchIn,                            /* Scratch buffer that receives the delayed state samples of one tap */
+        uint32_t blockSize)                            /* Number of samples processed by this call */
+{
+        q31_t *pState = S->pState;                     /* State (delay line) pointer */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer, advanced once per tap */
+        q31_t *px;                                     /* Read pointer into the scratch buffer */
+        q31_t *py = pState;                            /* Working pointer for the state buffer */
+        q31_t *pb = pScratchIn;                        /* Base of the scratch buffer */
+        q31_t *pOut;                                   /* Write pointer into the destination buffer */
+        q63_t out;                                     /* 64-bit temporary for the accumulate step */
+        int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing the delay of each non-zero tap, advanced once per tap */
+        uint32_t delaySize = S->maxDelay + blockSize;  /* Length of the circular state buffer */
+        uint16_t numTaps = S->numTaps;                 /* Number of (coefficient, delay) pairs processed */
+        int32_t readIndex;                             /* Read index into the state buffer */
+        uint32_t tapCnt, blkCnt;                       /* Loop counters */
+        q31_t coeff = *pCoeffs++;                      /* First coefficient; read unconditionally, even if numTaps is 0 */
+        q31_t in;                                      /* Temporary for the final left shift */
+
+  /* Overview: store the input block in the circular state buffer, accumulate the high 32 bits of each coeff * delayed-sample product in pDst, then shift the result left by one. */
+  /* blockSize input samples are copied into the circular state buffer. */
+  /* The f32 circular helper is reused here; the q31 buffers are handed over through int32_t * casts. */
+  arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
+                        (int32_t *) pSrc, 1, blockSize);
+
+  /* First tap: start reading blockSize plus the tap delay behind the current state index. */
+  readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+  /* A negative index wraps around to the end of the circular buffer */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is reset to the buffer base */
+  py = pState;
+
+  /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                       (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                       blockSize);
+
+  /* Working pointer for the scratch buffer of state values */
+  px = pb;
+
+  /* Working pointer for the output values, which are built up directly in pDst */
+  pOut = pDst;
+  
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* First tap initialises the output with the upper 32 bits of the 64-bit product */
+    *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Remaining taps. numTaps == 0 would wrap this unsigned counter to a huge value. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient of this tap and
+     * advance the coefficient pointer for the next tap */
+    coeff = *pCoeffs++;
+
+    /* Same read-index computation as for the first tap, using this tap's delay. */
+    readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+    /* A negative index wraps around to the end of the circular buffer */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is reset to the buffer base */
+    py = pState;
+
+    /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+    arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+                         (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+                         blockSize);
+
+    /* Working pointer for the scratch buffer of state values */
+    px = pb;
+
+    /* Working pointer for the output values, which are built up directly in pDst */
+    pOut = pDst;
+
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Multiply-accumulate in 64 bits; the sum is cast back to q31 without saturation */
+      out = *pOut;
+      out += ((q63_t) * px++ * coeff) >> 32;
+      *pOut++ = (q31_t) (out);
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+
+  /* Working output pointer is rewound to the start of pDst */
+  pOut = pDst;
+
+  /* Output is converted into 1.31 format by a left shift of one bit; no saturation is applied. */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    in = *pOut << 1;
+    *pOut++ = in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+
+void ref_fir_sparse_q15(                               /* Reference sparse FIR filter for q15 data */
+        arm_fir_sparse_instance_q15 * S,               /* Filter instance: pState, pCoeffs, pTapDelay, maxDelay, numTaps, stateIndex are used */
+        q15_t * pSrc,                                  /* Input block, blockSize samples */
+        q15_t * pDst,                                  /* Output block, blockSize samples */
+        q15_t * pScratchIn,                            /* Scratch buffer that receives the delayed state samples of one tap */
+        q31_t * pScratchOut,                           /* Scratch buffer holding the 32-bit accumulators, blockSize entries */
+        uint32_t blockSize)                            /* Number of samples processed by this call */
+{
+        q15_t *pState = S->pState;                     /* State (delay line) pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer, advanced once per tap */
+        q15_t *pIn = pSrc;                             /* Working pointer for input */
+        q15_t *pOut = pDst;                            /* Working pointer for output */
+        q15_t *px;                                     /* Read pointer into the input scratch buffer */
+        q15_t *pb = pScratchIn;                        /* Base of the input scratch buffer */
+        q15_t *py = pState;                            /* Working pointer for the state buffer */
+        int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing the delay of each non-zero tap, advanced once per tap */
+        uint32_t delaySize = S->maxDelay + blockSize;  /* Length of the circular state buffer */
+        uint16_t numTaps = S->numTaps;                 /* Number of (coefficient, delay) pairs processed */
+        int32_t readIndex;                             /* Read index into the state buffer */
+        uint32_t tapCnt, blkCnt;                       /* Loop counters */
+        q15_t coeff = *pCoeffs++;                      /* First coefficient; read unconditionally, even if numTaps is 0 */
+        q31_t *pScr2 = pScratchOut;                    /* Remembers the base of pScratchOut so it can be rewound per tap */
+
+  /* blockSize input samples are copied into the circular state buffer. */
+  /* &S->stateIndex gives the write position in the state buffer. */
+  arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
+
+  /* This value is not read before tapCnt is reassigned ahead of the tap loop below. */
+  tapCnt = numTaps;
+
+  /* First tap: start reading blockSize plus the tap delay behind the current state index. */
+  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+  /* A negative index wraps around to the end of the circular buffer */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is reset to the buffer base */
+  py = pState;
+
+  /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+  arm_circularRead_q15(py, delaySize, &readIndex, 1,
+                       pb, pb, blockSize, 1, blockSize);
+
+  /* Working pointer for the scratch buffer of state values */
+  px = pb;
+
+  /* Working pointer for scratch buffer of output values, rewound to its base */
+  pScratchOut = pScr2;
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* First tap initialises the 32-bit accumulators with the full product */
+    *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Remaining taps. numTaps == 0 would wrap this unsigned counter to a huge value. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient of this tap and
+     * advance the coefficient pointer for the next tap */
+    coeff = *pCoeffs++;
+
+    /* Same read-index computation as for the first tap, using this tap's delay. */
+    readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+    /* A negative index wraps around to the end of the circular buffer */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is reset to the buffer base */
+    py = pState;
+
+    /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+    arm_circularRead_q15(py, delaySize, &readIndex, 1,
+                         pb, pb, blockSize, 1, blockSize);
+
+    /* Working pointer for the scratch buffer of state values */
+    px = pb;
+
+    /* Working pointer for scratch buffer of output values, rewound to its base */
+    pScratchOut = pScr2;
+
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Multiply-accumulate into the 32-bit accumulator; no saturation at this step */
+      *pScratchOut++ += (q31_t) * px++ * coeff;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+
+  /* All the accumulated values are in the pScratchOut buffer.
+     Shift them right by 15 into 1.15 format, saturate to 16 bits and store in the destination buffer. */
+  /* Loop over the blockSize. */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
+    blkCnt--;
+  }
+}
+
+void ref_fir_sparse_q7(                                /* Reference sparse FIR filter for q7 data */
+        arm_fir_sparse_instance_q7 * S,                /* Filter instance: pState, pCoeffs, pTapDelay, maxDelay, numTaps, stateIndex are used */
+        q7_t *pSrc,                                    /* Input block, blockSize samples */
+        q7_t *pDst,                                    /* Output block, blockSize samples */
+        q7_t *pScratchIn,                              /* Scratch buffer that receives the delayed state samples of one tap */
+        q31_t * pScratchOut,                           /* Scratch buffer holding the 32-bit accumulators, blockSize entries */
+        uint32_t blockSize)                            /* Number of samples processed by this call */
+{
+        q7_t *pState = S->pState;                      /* State (delay line) pointer */
+  const q7_t *pCoeffs = S->pCoeffs;                    /* Coefficient pointer, advanced once per tap */
+        q7_t *px;                                      /* Read pointer into the input scratch buffer */
+        q7_t *py = pState;                             /* Working pointer for the state buffer */
+        q7_t *pb = pScratchIn;                         /* Base of the input scratch buffer */
+        q7_t *pOut = pDst;                             /* Write pointer into the destination buffer */
+        int32_t *pTapDelay = S->pTapDelay;             /* Pointer to the array containing the delay of each non-zero tap, advanced once per tap */
+        uint32_t delaySize = S->maxDelay + blockSize;  /* Length of the circular state buffer */
+        uint16_t numTaps = S->numTaps;                 /* Number of (coefficient, delay) pairs processed */
+        int32_t readIndex;                             /* Read index into the state buffer */
+        uint32_t tapCnt, blkCnt;                       /* Loop counters */
+        q7_t coeff = *pCoeffs++;                       /* First coefficient; read unconditionally, even if numTaps is 0 */
+        q31_t *pScr2 = pScratchOut;                    /* Remembers the base of pScratchOut so it can be rewound per tap */
+        q31_t in;                                      /* Temporary for the accumulate step */
+
+  /* blockSize input samples are copied into the circular state buffer. */
+  /* &S->stateIndex gives the write position in the state buffer. */
+  arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
+                       blockSize);
+
+  /* This value is not read before tapCnt is reassigned ahead of the tap loop below. */
+  tapCnt = numTaps;
+
+  /* First tap: start reading blockSize plus the tap delay behind the current state index. */
+  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+  /* A negative index wraps around to the end of the circular buffer */
+  if (readIndex < 0)
+  {
+    readIndex += (int32_t) delaySize;
+  }
+
+  /* Working pointer for state buffer is reset to the buffer base */
+  py = pState;
+
+  /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+                      (int32_t) blockSize, 1, blockSize);
+
+  /* Working pointer for the scratch buffer of state values */
+  px = pb;
+
+  /* Working pointer for scratch buffer of output values, rewound to its base */
+  pScratchOut = pScr2;
+
+  /* Loop over the blockSize */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* First tap initialises the 32-bit accumulators with the full product */
+    *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Remaining taps. numTaps == 0 would wrap this unsigned counter to a huge value. */
+  tapCnt = (uint32_t) numTaps - 1U;
+
+  while (tapCnt > 0U)
+  {
+    /* Load the coefficient of this tap and
+     * advance the coefficient pointer for the next tap */
+    coeff = *pCoeffs++;
+
+    /* Same read-index computation as for the first tap, using this tap's delay. */
+    readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+    /* A negative index wraps around to the end of the circular buffer */
+    if (readIndex < 0)
+    {
+      readIndex += (int32_t) delaySize;
+    }
+
+    /* Working pointer for state buffer is reset to the buffer base */
+    py = pState;
+
+    /* blockSize delayed samples are read from the state buffer into the scratch buffer */
+    arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+                        (int32_t) blockSize, 1, blockSize);
+
+    /* Working pointer for the scratch buffer of state values */
+    px = pb;
+
+    /* Working pointer for scratch buffer of output values, rewound to its base */
+    pScratchOut = pScr2;
+
+    /* Loop over the blockSize */
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+      /* Multiply-accumulate into the 32-bit accumulator; no saturation at this step */
+      in = *pScratchOut + ((q31_t) * px++ * coeff);
+      *pScratchOut++ = in;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Decrement the tap loop counter */
+    tapCnt--;
+  }
+
+  /* All the accumulated values are in the pScratchOut buffer.
+     Shift them right by 7 into 1.7 format, saturate to 8 bits and store in the destination buffer. */
+  /* Loop over the blockSize. */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+
+    /* Decrement the blockSize loop counter */
+    blkCnt--;
+  }
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/iir_lattice.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/iir_lattice.c
new file mode 100644
index 0000000..ab37d5f
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/iir_lattice.c
@@ -0,0 +1,271 @@
+#include "ref.h"
+
+void ref_iir_lattice_f32(                       /* Reference IIR lattice filter for float32 data */
+  const arm_iir_lattice_instance_f32 * S,       /* Filter instance: numStages, pState, pkCoeffs, pvCoeffs are used; the state buffer is modified */
+  float32_t * pSrc,                             /* Input block, blockSize samples */
+  float32_t * pDst,                             /* Output block, blockSize samples */
+  uint32_t blockSize)                           /* Number of samples processed by this call */
+{
+  float32_t fcurr, fnext = 0, gcurr, gnext;      /* Temporary variables for lattice stages */
+  float32_t acc;                                 /* Accumulator */
+  uint32_t blkCnt, tapCnt;                       /* temporary variables for counts */
+  float32_t *px1, *px2, *pk, *pv;                /* state read/write pointers, reflection and ladder coefficient pointers */
+  uint32_t numStages = S->numStages;             /* number of stages */
+  float32_t *pState;                             /* State pointer, advanced by one per input sample */
+  float32_t *pStateCurnt;                        /* Destination pointer for the final state copy */
+
+  blkCnt = blockSize;
+  pState = &S->pState[0];
+
+  /* Sample processing: each sample reads numStages state values and writes numStages + 1 back, so indices up to blockSize - 1 + numStages are touched */
+  while (blkCnt > 0U)
+  {
+    /* Read Sample from input buffer */
+    /* fN(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize state read pointer */
+    px1 = pState;
+    /* Initialize state write pointer (same location as the read pointer) */
+    px2 = pState;
+    /* Set accumulator to zero */
+    acc = 0.0f;
+    /* Initialize Ladder coeff pointer */
+    pv = &S->pvCoeffs[0];
+    /* Initialize Reflection coeff pointer */
+    pk = &S->pkCoeffs[0];
+
+    /* Process sample for numStages */
+    tapCnt = numStages;
+
+    while (tapCnt > 0U)
+    {
+      gcurr = *px1++;
+      /* fN-1(n) = fN(n) - kN * gN-1(n-1), then gN(n) = kN * fN-1(n) + gN-1(n-1) */
+      fnext = fcurr - (*pk) * gcurr;
+      gnext = fnext * (*pk++) + gcurr;
+
+      /* y(n) += gN(n) * vN, and gN(n) is stored for the next input sample */
+      acc += gnext * (*pv++);
+      *px2++ = gnext;
+      fcurr = fnext;
+
+      /* Decrementing loop counter */
+      tapCnt--;
+    }
+
+    /* y(n) += g0(n) * v0, using the ladder coefficient after the numStages already consumed */
+    acc += fnext * (*pv);
+
+    *px2++ = fnext;
+
+    /* write out into pDst */
+    *pDst++ = acc;
+
+    /* Advance the state pointer by 1 to process the next sample */
+    pState = pState + 1U;
+    blkCnt--;
+  }
+
+  /* Processing is complete. Now copy numStages values starting at S->pState[blockSize] to the start of the buffer
+     in preparation for processing the next frame */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = &S->pState[0];
+  pState = &S->pState[blockSize];
+
+  tapCnt = numStages;
+
+  /* Copy the data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_iir_lattice_q31(                       /* Reference IIR lattice filter for q31 data */
+  const arm_iir_lattice_instance_q31 * S,       /* Filter instance: numStages, pState, pkCoeffs, pvCoeffs are used; the state buffer is modified */
+  q31_t * pSrc,                                 /* Input block, blockSize samples */
+  q31_t * pDst,                                 /* Output block, blockSize samples */
+  uint32_t blockSize)                           /* Number of samples processed by this call */
+{
+  q31_t fcurr, fnext = 0, gcurr = 0, gnext;      /* Temporary variables for lattice stages */
+  q63_t acc;                                     /* Accumulator (64-bit) */
+  uint32_t blkCnt, tapCnt;                       /* Temporary variables for counts */
+  q31_t *px1, *px2, *pk, *pv;                    /* state read/write pointers, reflection and ladder coefficient pointers */
+  uint32_t numStages = S->numStages;             /* number of stages */
+  q31_t *pState;                                 /* State pointer, advanced by one per input sample */
+  q31_t *pStateCurnt;                            /* Destination pointer for the final state copy */
+
+  blkCnt = blockSize;
+  pState = &S->pState[0];
+
+  /* Sample processing: each sample reads numStages state values and writes numStages + 1 back, so indices up to blockSize - 1 + numStages are touched */
+  while (blkCnt > 0U)
+  {
+    /* Read Sample from input buffer */
+    /* fN(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize state read pointer */
+    px1 = pState;
+    /* Initialize state write pointer (same location as the read pointer) */
+    px2 = pState;
+    /* Set accumulator to zero */
+    acc = 0;
+    /* Initialize Ladder coeff pointer */
+    pv = &S->pvCoeffs[0];
+    /* Initialize Reflection coeff pointer */
+    pk = &S->pkCoeffs[0];
+
+    tapCnt = numStages;
+
+    while (tapCnt > 0U)
+    {
+      gcurr = *px1++;
+      /* Process sample */
+      /* fN-1(n) = fN(n) - kN * gN-1(n-1), product scaled by >> 31 and the difference saturated to q31 */
+      fnext =
+        ref_sat_q31(((q63_t) fcurr -
+                         ((q31_t) (((q63_t) gcurr * (*pk)) >> 31))));
+      /* gN(n) = kN * fN-1(n) + gN-1(n-1), product scaled by >> 31 and the sum saturated to q31 */
+      gnext =
+        ref_sat_q31(((q63_t) gcurr +
+                         ((q31_t) (((q63_t) fnext * (*pk++)) >> 31))));
+      /* Output samples */
+      /* y(n) += gN(n) * vN, accumulated as a full 64-bit product */
+      acc += ((q63_t) gnext * *pv++);
+      /* write gN(n) into state for next sample processing */
+      *px2++ = gnext;
+      /* Update f values for next coefficient processing */
+      fcurr = fnext;
+
+      tapCnt--;
+    }
+
+    /* y(n) += g0(n) * v0, using the ladder coefficient after the numStages already consumed */
+    acc += (q63_t) fnext *(*pv++);
+
+    *px2++ = fnext;
+
+    /* write out into pDst: accumulator scaled by >> 31 and cast to q31 without saturation */
+    *pDst++ = (q31_t) (acc >> 31U);
+
+    /* Advance the state pointer by 1 to process the next sample */
+    pState = pState + 1U;
+    blkCnt--;
+  }
+
+  /* Processing is complete. Now copy numStages values starting at S->pState[blockSize] to the start of the buffer
+     in preparation for processing the next frame */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = &S->pState[0];
+  pState = &S->pState[blockSize];
+
+  tapCnt = numStages;
+
+  /* Copy the remaining q31_t data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_iir_lattice_q15(                       /* Reference IIR lattice filter for q15 data */
+  const arm_iir_lattice_instance_q15 * S,       /* Filter instance: numStages, pState, pkCoeffs, pvCoeffs are used; the state buffer is modified */
+  q15_t * pSrc,                                 /* Input block, blockSize samples */
+  q15_t * pDst,                                 /* Output block, blockSize samples */
+  uint32_t blockSize)                           /* Number of samples processed by this call */
+{
+  q31_t fcurr, fnext = 0, gcurr = 0, gnext;      /* Temporary variables for lattice stages (held in 32 bits) */
+  uint32_t stgCnt;                               /* Counter for the final state copy */
+  q63_t acc;                                     /* Accumulator (64-bit) */
+  uint32_t blkCnt, tapCnt;                       /* Temporary variables for counts */
+  q15_t *px1, *px2, *pk, *pv;                    /* state read/write pointers, reflection and ladder coefficient pointers */
+  uint32_t numStages = S->numStages;             /* number of stages */
+  q15_t *pState;                                 /* State pointer, advanced by one per input sample */
+  q15_t *pStateCurnt;                            /* Destination pointer for the final state copy */
+  q15_t out;                                     /* Temporary variable for output */
+
+  blkCnt = blockSize;
+  pState = &S->pState[0];
+
+  /* Sample processing: each sample reads numStages state values and writes numStages + 1 back, so indices up to blockSize - 1 + numStages are touched */
+  while (blkCnt > 0U)
+  {
+    /* Read Sample from input buffer */
+    /* fN(n) = x(n) */
+    fcurr = *pSrc++;
+
+    /* Initialize state read pointer */
+    px1 = pState;
+    /* Initialize state write pointer (same location as the read pointer) */
+    px2 = pState;
+    /* Set accumulator to zero */
+    acc = 0;
+    /* Initialize Ladder coeff pointer */
+    pv = &S->pvCoeffs[0];
+    /* Initialize Reflection coeff pointer */
+    pk = &S->pkCoeffs[0];
+
+    tapCnt = numStages;
+
+    while (tapCnt > 0U)
+    {
+      gcurr = *px1++;
+      /* Process sample */
+      /* fN-1(n) = fN(n) - kN * gN-1(n-1), product scaled by >> 15 and the result saturated to q15 */
+      fnext = fcurr - ((gcurr * (*pk)) >> 15);
+      fnext = ref_sat_q15(fnext);
+      /* gN(n) = kN * fN-1(n) + gN-1(n-1), product scaled by >> 15 and the result saturated to q15 */
+      gnext = ((fnext * (*pk++)) >> 15) + gcurr;
+      gnext = ref_sat_q15(gnext);
+      /* Output samples */
+      /* y(n) += gN(n) * vN, the 32-bit product is added to the 64-bit accumulator */
+      acc += (q31_t) ((gnext * (*pv++)));
+      /* write gN(n) into state for next sample processing */
+      *px2++ = (q15_t) gnext;
+      /* Update f values for next coefficient processing */
+      fcurr = fnext;
+
+      tapCnt--;
+    }
+
+    /* y(n) += g0(n) * v0, using the ladder coefficient after the numStages already consumed */
+    acc += (q31_t) ((fnext * (*pv++)));
+
+    out = ref_sat_q15(acc >> 15);
+    *px2++ = (q15_t) fnext;
+
+    /* write out into pDst (accumulator scaled by >> 15 and saturated above) */
+    *pDst++ = out;
+
+    /* Advance the state pointer by 1 to process the next sample */
+    pState = pState + 1U;
+    blkCnt--;
+  }
+
+  /* Processing is complete. Now copy numStages values starting at S->pState[blockSize] to the start of the buffer
+     in preparation for processing the next frame */
+  /* Points to the start of the state buffer */
+  pStateCurnt = &S->pState[0];
+  pState = &S->pState[blockSize];
+
+  stgCnt = numStages;
+
+  /* copy data */
+  while (stgCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    stgCnt--;
+  }
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/lms.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/lms.c
new file mode 100644
index 0000000..fee99f9
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/FilteringFunctions/lms.c
@@ -0,0 +1,695 @@
+#include "ref.h"
+
+void ref_lms_f32(
+  const arm_lms_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pRef,
+  float32_t * pOut,
+  float32_t * pErr,
+  uint32_t blockSize)
+{
+  /* Reference floating-point LMS adaptive FIR filter.
+   *   S         - instance supplying pState, pCoeffs, numTaps and mu
+   *   pSrc      - blockSize input samples
+   *   pRef      - blockSize reference (desired) samples
+   *   pOut      - receives blockSize filter outputs
+   *   pErr      - receives blockSize errors (reference minus output)
+   *   blockSize - number of samples processed by this call
+   * The coefficients are adapted in place after every sample. */
+  const uint32_t taps = S->numTaps;
+  const float32_t stepSize = S->mu;
+  float32_t *coeffs = S->pCoeffs;
+  float32_t *window = S->pState;                 /* first sample of the current tap window */
+  float32_t *newest = &(S->pState[taps - 1U]);   /* slot that receives the next input */
+  float32_t sum, e, d;
+  float32_t w = 0.0f;
+  uint32_t n, k;
+
+  for (n = 0U; n < blockSize; n++)
+  {
+    /* Append the incoming sample behind the samples kept from the previous call */
+    *newest++ = *pSrc++;
+
+    /* Filter output: dot product of the window with the coefficients */
+    sum = 0.0f;
+    for (k = 0U; k < taps; k++)
+    {
+      sum += window[k] * coeffs[k];
+    }
+    *pOut++ = sum;
+
+    /* Error against the reference sample */
+    d = *pRef++;
+    e = d - sum;
+    *pErr++ = e;
+
+    /* Move every coefficient along its window sample, scaled by mu * error */
+    w = e * stepSize;
+    for (k = 0U; k < taps; k++)
+    {
+      coeffs[k] += w * window[k];
+    }
+
+    /* Slide the window by one sample */
+    window++;
+  }
+
+  /* Keep the last numTaps - 1 samples at the front of the state buffer
+   * so the next call can continue from them. */
+  for (k = 0U; k < taps - 1U; k++)
+  {
+    S->pState[k] = window[k];
+  }
+}
+
+void ref_lms_norm_f32(
+  arm_lms_norm_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pRef,
+  float32_t * pOut,
+  float32_t * pErr,
+  uint32_t blockSize)
+{
+  /* Reference floating-point normalized LMS adaptive FIR filter.
+   *   S         - instance supplying pState, pCoeffs, numTaps, mu, energy, x0
+   *   pSrc      - blockSize input samples
+   *   pRef      - blockSize reference (desired) samples
+   *   pOut      - receives blockSize filter outputs
+   *   pErr      - receives blockSize errors (reference minus output)
+   *   blockSize - number of samples processed by this call
+   * S->energy and S->x0 are read on entry and written back on exit. */
+  const uint32_t taps = S->numTaps;
+  const float32_t stepSize = S->mu;
+  float32_t *coeffs = S->pCoeffs;
+  float32_t *window = S->pState;                 /* first sample of the current tap window */
+  float32_t *newest = &(S->pState[taps - 1U]);   /* slot that receives the next input */
+  float32_t energy = S->energy;                  /* running energy estimate */
+  float32_t x0 = S->x0;                          /* sample that last left the window */
+  float32_t sum, in;
+  float32_t e = 0.0f, d = 0.0f, w = 0.0f;
+  uint32_t n, k;
+
+  for (n = 0U; n < blockSize; n++)
+  {
+    /* Fetch the input once and append it to the state buffer */
+    in = *pSrc++;
+    *newest++ = in;
+
+    /* Running energy: drop the departed sample, add the new one */
+    energy -= x0 * x0;
+    energy += in * in;
+
+    /* Filter output: dot product of the window with the coefficients */
+    sum = 0.0f;
+    for (k = 0U; k < taps; k++)
+    {
+      sum += window[k] * coeffs[k];
+    }
+    *pOut++ = sum;
+
+    /* Error against the reference sample */
+    d = *pRef++;
+    e = d - sum;
+    *pErr++ = e;
+
+    /* Step scaled by the energy; the small constant keeps the divisor non-zero */
+    w = e * stepSize / (energy + 0.000000119209289f);
+
+    for (k = 0U; k < taps; k++)
+    {
+      coeffs[k] += w * window[k];
+    }
+
+    /* Remember the sample about to leave the window, then slide by one */
+    x0 = *window;
+    window++;
+  }
+
+  S->energy = energy;
+  S->x0 = x0;
+
+  /* Keep the last numTaps - 1 samples at the front of the state buffer
+   * so the next call can continue from them. */
+  for (k = 0U; k < taps - 1U; k++)
+  {
+    S->pState[k] = window[k];
+  }
+}
+
+void ref_lms_q31(
+  const arm_lms_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pRef,
+  q31_t * pOut,
+  q31_t * pErr,
+  uint32_t blockSize)
+{ /* Reference Q31 LMS adaptive FIR filter.
+   *   S         - instance supplying pState, pCoeffs, numTaps, mu and postShift
+   *   pSrc      - blockSize input samples
+   *   pRef      - blockSize reference (desired) samples
+   *   pOut      - receives blockSize filter outputs
+   *   pErr      - receives blockSize errors (reference minus output)
+   *   blockSize - number of samples processed by this call
+   * pCoeffs is updated in place after every sample, and the last
+   * numTaps - 1 state samples are moved to the front of pState on exit. */
+  q31_t *pState = S->pState;                     /* State pointer */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t mu = S->mu;                              /* Adaptive factor */
+  q31_t *px;                                     /* Temporary pointer for state */
+  q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q63_t acc;                                     /* Accumulator */
+  q31_t e = 0;                                   /* error of data sample */
+  q31_t alpha;                                   /* Intermediate constant for taps update */
+  q31_t coef;                                    /* Temporary variable for coef */
+  q31_t acc_l, acc_h;                            /* Lower and upper 32-bit halves of acc */
+  uint32_t uShift = (uint32_t)S->postShift + 1;  /* Left shift applied to the upper half of acc */
+  uint32_t lShift = 32U - uShift;                /* Right shift applied to the lower half of acc */
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate in 64 bits */
+      acc += (q63_t)(*px++) * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Converting the result to 1.31 format */
+    /* Store the result from accumulator into the destination buffer. */
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    /* Recombine the halves: (low >> lShift) | (high << uShift), where
+       lShift + uShift == 32 */
+    acc = (uint32_t)acc_l >> lShift | acc_h << uShift;
+
+    *pOut++ = (q31_t)acc;
+
+    /* Compute and store error */
+    e = *pRef++ - (q31_t)acc;
+
+    *pErr++ = (q31_t)e;
+
+    /* Weighting factor for the LMS version: (e * mu) >> 31 */
+    alpha = (q31_t)(((q63_t)e * mu) >> 31);
+
+    /* Initialize pState pointer */
+    /* Advance state pointer by 1 for the next sample */
+    px = pState++;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Coefficient update: add (alpha * state) to the coefficient, saturating
+         the sum to 32 bits */
+      coef = (q31_t)(((q63_t) alpha * (*px++)) >> 32);
+      *pb = ref_sat_q31((q63_t)*pb + (coef << 1));
+      pb++;
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+  }
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the     
+     start of the state buffer. This prepares the state buffer for the   
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /*  Copy (numTaps - 1U) samples  */
+  tapCnt = numTaps - 1;
+
+  /* Copy the data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_lms_norm_q31(
+  arm_lms_norm_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pRef,
+  q31_t * pOut,
+  q31_t * pErr,
+  uint32_t blockSize)
+{ /* Reference Q31 normalized LMS adaptive FIR filter.
+   *   S         - instance supplying pState, pCoeffs, numTaps, mu, postShift,
+   *               energy and x0
+   *   pSrc      - blockSize input samples
+   *   pRef      - blockSize reference (desired) samples
+   *   pOut      - receives blockSize filter outputs
+   *   pErr      - receives blockSize errors (reference minus output)
+   *   blockSize - number of samples processed by this call
+   * S->energy and S->x0 are read on entry and written back on exit; pCoeffs is
+   * updated in place after every sample. */
+  q31_t *pState = S->pState;                     /* State pointer */
+  q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q31_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q31_t *px, *pb;                                /* Temporary pointers for state and coefficient buffers */
+  q31_t mu = S->mu;                              /* Adaptive factor */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q63_t energy;                                  /* Energy of the input */
+  q63_t acc;                                     /* Accumulator */
+  q31_t e = 0, d = 0;                            /* error, reference data sample */
+  q31_t w = 0, in;                               /* weight factor, current input sample */
+  q31_t x0;                                      /* state sample subtracted from the energy on the next iteration */   
+  q63_t errorXmu;                   				 /* Product of error and mu */
+  q31_t coef;                                    /* Temporary variable for coef */
+  q31_t acc_l, acc_h;                            /* Lower and upper 32-bit halves of acc */
+  uint32_t uShift = ((uint32_t) S->postShift + 1U); /* Left shift applied to the upper half of acc */
+  uint32_t lShift = 32U - uShift;                /* Right shift applied to the lower half of acc */
+
+  energy = S->energy;
+  x0 = S->x0;
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Read the sample from input buffer */
+    in = *pSrc++;
+
+    /* Update the energy calculation: subtract x0 * x0, then add in * in.
+       Each product is doubled (<< 1) and only the upper 32 bits of the
+       64-bit intermediate are kept; the 0xffffffff mask is applied after
+       the q31_t cast. */
+    energy = (q31_t)((((q63_t)energy << 32) - (((q63_t)x0 * x0) << 1)) >> 32) & 0xffffffff;
+    energy = (q31_t)(((((q63_t)in * in) << 1) + ((q63_t)energy << 32)) >> 32) & 0xffffffff;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate in 64 bits */
+      acc += ((q63_t) (*px++)) * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Converting the result to 1.31 format */
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    /* Recombine the halves: (low >> lShift) | (high << uShift), where
+       lShift + uShift == 32 */
+    acc = (uint32_t)acc_l >> lShift | acc_h << uShift;
+
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = (q31_t)acc;
+
+    /* Compute and store error */
+    d = *pRef++;
+    e = d - (q31_t)acc;
+    *pErr++ = e;
+
+    /* Calculation of product of (e * mu) */
+    errorXmu = (q63_t)e * mu;
+
+    /* Weighting factor for the normalized version: (e * mu) divided by
+       (energy + DELTA_Q31), saturated to 32 bits */
+    w = ref_sat_q31(errorXmu / (energy + DELTA_Q31));
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Coefficient update */
+      /* coef is in 2.30 format */
+      coef = (q31_t)(((q63_t)w * (*px++)) >> 32);
+      /* get coef in 1.31 format by left shifting */
+      *pb = ref_sat_q31((q63_t)*pb + (coef << 1U));
+      /* update coefficient buffer to next coefficient */
+      pb++;
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Read the sample from state buffer */
+    x0 = *pState;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState++;
+  }
+
+  /* Save energy and x0 values for the next frame */
+  S->energy = (q31_t)energy;
+  S->x0 = x0;
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the     
+     start of the state buffer. This prepares the state buffer for the        
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /* Loop for (numTaps - 1U) samples copy */
+  tapCnt = numTaps - 1;
+
+  /* Copy the remaining q31_t data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_lms_q15(
+  const arm_lms_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pRef,
+  q15_t * pOut,
+  q15_t * pErr,
+  uint32_t blockSize)
+{ /* Reference Q15 LMS adaptive FIR filter.
+   *   S         - instance supplying pState, pCoeffs, numTaps, mu and postShift
+   *   pSrc      - blockSize input samples
+   *   pRef      - blockSize reference (desired) samples
+   *   pOut      - receives blockSize filter outputs (saturated to 16 bits)
+   *   pErr      - receives blockSize errors (reference minus output)
+   *   blockSize - number of samples processed by this call
+   * pCoeffs is updated in place after every sample, and the last
+   * numTaps - 1 state samples are moved to the front of pState on exit. */
+  q15_t *pState = S->pState;                     /* State pointer */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q15_t mu = S->mu;                              /* Adaptive factor */
+  q15_t *px;                                     /* Temporary pointer for state */
+  q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q63_t acc;                                     /* Accumulator */
+  q15_t e = 0;                                   /* error of data sample */
+  q15_t alpha;                                   /* Intermediate constant for taps update */
+  q31_t coef;                                    /* Temporary variable for coefficient */
+  q31_t acc_l, acc_h;                            /* Lower and upper 32-bit halves of acc */
+  int32_t lShift = 15 - (int32_t)S->postShift;   /* Right shift applied to the lower half of acc */
+  int32_t uShift = 32 - lShift;                  /* Left shift applied to the upper half of acc */
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate: 32-bit product, 64-bit sum */
+      acc += (q63_t)((q31_t)(*px++) * (*pb++));
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    /* Apply shift for lower part of acc and upper part of acc */
+    acc = (uint32_t)acc_l >> lShift | acc_h << uShift;
+
+    /* Converting the result to 1.15 format and saturate the output */
+    acc = ref_sat_q15(acc);
+
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = (q15_t)acc;
+
+    /* Compute and store error */
+    e = *pRef++ - (q15_t)acc;
+
+    *pErr++ = (q15_t)e;
+
+    /* Compute alpha i.e. intermediate constant for taps update */
+    alpha = (q15_t)(((q31_t)e * mu) >> 15);
+
+    /* Initialize pState pointer */
+    /* Advance state pointer by 1 for the next sample */
+    px = pState++;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Coefficient update: add (alpha * state) >> 15 and saturate to 16 bits */
+      coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+      *pb++ = (q15_t) ref_sat_q15(coef);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+  }
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the        
+     start of the state buffer. This prepares the state buffer for the   
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /*  Copy (numTaps - 1U) samples  */
+  tapCnt = numTaps - 1;
+
+  /* Copy the data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
+
+void ref_lms_norm_q15(
+  arm_lms_norm_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pRef,
+  q15_t * pOut,
+  q15_t * pErr,
+  uint32_t blockSize)
+{ /* Reference Q15 normalized LMS adaptive FIR filter.
+   *   S         - instance supplying pState, pCoeffs, numTaps, mu, postShift,
+   *               energy, x0 and recipTable
+   *   pSrc      - blockSize input samples
+   *   pRef      - blockSize reference (desired) samples
+   *   pOut      - receives blockSize filter outputs (saturated to 16 bits)
+   *   pErr      - receives blockSize errors (reference minus output)
+   *   blockSize - number of samples processed by this call
+   * S->energy and S->x0 are read on entry and written back on exit; pCoeffs is
+   * updated in place after every sample. */
+  q15_t *pState = S->pState;                     /* State pointer */
+  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
+  q15_t *px, *pb;                                /* Temporary pointers for state and coefficient buffers */
+  q15_t mu = S->mu;                              /* Adaptive factor */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t tapCnt, blkCnt;                       /* Loop counters */
+  q31_t energy;                                  /* Energy of the input */
+  q63_t acc;                                     /* Accumulator */
+  q15_t e = 0, d = 0;                            /* error, reference data sample */
+  q15_t w = 0, in;                               /* weight factor, current input sample */
+  q15_t x0;                                      /* state sample subtracted from the energy on the next iteration */
+  q15_t errorXmu, oneByEnergy;                   /* Temporary variables to store error and mu product and reciprocal of energy */
+  //q31_t errorXmu;                   				 /* Temporary variables to store error and mu product and reciprocal of energy */
+  q15_t postShift;                               /* Post shift to be applied to weight after reciprocal calculation */
+  q31_t coef;                                    /* Temporary variable for coefficient */
+  q31_t acc_l, acc_h;                            /* Lower and upper 32-bit halves of acc */
+  int32_t lShift = 15 - (int32_t)S->postShift;  /* Right shift applied to the lower half of acc */
+  int32_t uShift = 32 - lShift;                  /* Left shift applied to the upper half of acc */
+
+  energy = S->energy;
+  x0 = S->x0;
+
+  /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  for(blkCnt = blockSize; blkCnt > 0U; blkCnt--)
+  {
+    /* Copy the new input sample into the state buffer */
+    *pStateCurnt++ = *pSrc;
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize pCoeffs pointer */
+    pb = pCoeffs;
+
+    /* Read the sample from input buffer */
+    in = *pSrc++;
+
+    /* Update the energy calculation: subtract (x0 * x0) >> 15 and add
+       (in * in) >> 15, each masked to 16 bits */
+    energy -= (((q31_t)x0 * x0) >> 15) & 0xffff;
+    energy += (((q31_t)in * in) >> 15) & 0xffff;
+
+    /* Set the accumulator to zero */
+    acc = 0;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      acc += (q31_t)*px++ * (*pb++);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Calc lower part of acc */
+    acc_l = acc & 0xffffffff;
+
+    /* Calc upper part of acc */
+    acc_h = (acc >> 32) & 0xffffffff;
+
+    /* Apply shift for lower part of acc and upper part of acc */
+    acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+    /* Converting the result to 1.15 format and saturate the output */
+    acc = ref_sat_q15(acc);
+
+    /* Store the result from accumulator into the destination buffer. */
+    *pOut++ = (q15_t) acc;
+
+    /* Compute and store error */
+    d = *pRef++;
+    e = d - (q15_t) acc;
+    *pErr++ = e;
+    
+#if 0
+    /* Calculation of e * mu value */
+    errorXmu = (q31_t) e * mu;
+
+    /* Calculation of (e * mu) /energy value */
+    acc = errorXmu / (energy + DELTA_Q15);
+#endif
+
+    /* Calculation of 1/energy: arm_recip_q15 writes the reciprocal to
+       oneByEnergy and its return value is kept as postShift */
+    postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
+                              &oneByEnergy, S->recipTable);
+
+    /* Calculation of e * mu value */
+    errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
+
+    /* Calculation of (e * mu) * (1/energy) value */
+    acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
+    
+    /* Weighting factor for the normalized version */
+    w = ref_sat_q15((q31_t)acc);
+
+    /* Initialize pState pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+    /* Loop over numTaps number of values */
+    tapCnt = numTaps;
+
+    while (tapCnt > 0U)
+    {
+      /* Coefficient update: add (w * state) >> 15 and saturate to 16 bits */
+      coef = *pb + (((q31_t)w * (*px++)) >> 15);
+      *pb++ = ref_sat_q15(coef);
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Read the sample from state buffer */
+    x0 = *pState;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState = pState + 1U;
+  }
+
+  /* Save energy and x0 values for the next frame */
+  S->energy = (q15_t)energy;
+  S->x0 = x0;
+
+  /* Processing is complete. Now copy the last numTaps - 1 samples to the        
+     start of the state buffer. This prepares the state buffer for the        
+     next function call. */
+
+  /* Points to the start of the pState buffer */
+  pStateCurnt = S->pState;
+
+  /* copy (numTaps - 1U) data */
+  tapCnt = numTaps - 1;
+
+  /* copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement the loop counter */
+    tapCnt--;
+  }
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/HelperFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/HelperFunctions.c
new file mode 100644
index 0000000..557e969
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/HelperFunctions.c
@@ -0,0 +1,4 @@
+
+#include "mat_helper.c"
+#include "ref_helper.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/mat_helper.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/mat_helper.c
new file mode 100644
index 0000000..0174ccf
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/mat_helper.c
@@ -0,0 +1,193 @@
+#include "ref.h"
+
+float32_t ref_detrm(float32_t *pSrc, float32_t *temp, uint32_t size)
+{
+   /* Determinant of the size x size row-major matrix pSrc, computed by
+    * cofactor expansion along the first row.
+    *   temp - scratch buffer; this level uses its first size*size entries
+    *          and hands the region starting at temp + size*size to the
+    *          recursive call.
+    * Returns pSrc[0] for size 1 and 0 for size 0. */
+   float32_t sign = 1;
+   float32_t det = 0;
+   uint32_t col, row, k, fill;
+
+   if ( size == 1 )
+   {
+      return pSrc[ 0 ];
+   }
+
+   for ( col = 0; col < size; col++ )
+   {
+      /* Build the minor that omits row 0 and column col, packed
+         contiguously at the front of temp */
+      fill = 0;
+
+      for ( row = 0; row < size; row++ )
+      {
+         for ( k = 0; k < size; k++ )
+         {
+            temp[ row*size + k ] = 0;
+
+            if ( row != 0 && k != col )
+            {
+               temp[ fill++ ] = pSrc[ row*size + k ];
+            }
+         }
+      }
+
+      /* Add this column's term with alternating sign */
+      det += sign * ( pSrc[ col ] * ref_detrm( temp, temp + size*size, size - 1 ) );
+      sign = -sign;
+   }
+
+   return det;
+}
+
+
+void ref_cofact(float32_t *pSrc, float32_t *pDst, float32_t *temp, uint32_t size)
+{
+   /* Fills pDst (size x size, row-major) with the cofactors of pSrc:
+    * entry (q, p) is (-1)^(q+p) times the determinant of pSrc with row q
+    * and column p removed.
+    *   temp - scratch buffer; the minor occupies its first (size-1)^2
+    *          entries and the rest is handed to ref_detrm.
+    * For size 1 the single cofactor is set to 1. */
+   uint32_t row, col, i, j, fill;
+
+   if ( size == 1 )
+   {
+      pDst[ 0 ] = 1;
+      return;
+   }
+
+   for ( row = 0; row < size; row++ )
+   {
+      for ( col = 0; col < size; col++ )
+      {
+         /* Pack the minor that omits (row, col) at the front of temp */
+         fill = 0;
+
+         for ( i = 0; i < size; i++ )
+         {
+            for ( j = 0; j < size; j++ )
+            {
+               temp[ i*size + j ] = 0;
+
+               if ( i != row && j != col )
+               {
+                  temp[ fill++ ] = pSrc[ i*size + j ];
+               }
+            }
+         }
+
+         pDst[ row*size + col ] = ref_pow( -1, row + col ) * ref_detrm( temp, temp + (size-1)*(size-1), size - 1 );
+      }
+   }
+}
+
+
+
+float64_t ref_detrm64(float64_t *pSrc, float64_t *temp, uint32_t size)
+{
+   /* Double-precision determinant of the size x size row-major matrix
+    * pSrc, computed by cofactor expansion along the first row.
+    *   temp - scratch buffer; this level uses its first size*size entries
+    *          and hands the region starting at temp + size*size to the
+    *          recursive call.
+    * Returns pSrc[0] for size 1 and 0 for size 0. */
+   float64_t sign = 1;
+   float64_t det = 0;
+   uint32_t col, row, k, fill;
+
+   if ( size == 1 )
+   {
+      return pSrc[ 0 ];
+   }
+
+   for ( col = 0; col < size; col++ )
+   {
+      /* Build the minor that omits row 0 and column col, packed
+         contiguously at the front of temp */
+      fill = 0;
+
+      for ( row = 0; row < size; row++ )
+      {
+         for ( k = 0; k < size; k++ )
+         {
+            temp[ row*size + k ] = 0;
+
+            if ( row != 0 && k != col )
+            {
+               temp[ fill++ ] = pSrc[ row*size + k ];
+            }
+         }
+      }
+
+      /* Add this column's term with alternating sign */
+      det += sign * ( pSrc[ col ] * ref_detrm64( temp, temp + size*size, size - 1 ) );
+      sign = -sign;
+   }
+
+   return det;
+}
+
+
+void ref_cofact64(float64_t *pSrc, float64_t *pDst, float64_t *temp, uint32_t size)
+{
+   /* Double-precision cofactor matrix: entry (q, p) of pDst is (-1)^(q+p)
+    * times the determinant of pSrc with row q and column p removed.
+    *   temp - scratch buffer; the minor occupies its first (size-1)^2
+    *          entries and the rest is handed to ref_detrm64.
+    * For size 1 the single cofactor is set to 1. */
+   uint32_t row, col, i, j, fill;
+
+   if ( size == 1 )
+   {
+      pDst[ 0 ] = 1;
+      return;
+   }
+
+   for ( row = 0; row < size; row++ )
+   {
+      for ( col = 0; col < size; col++ )
+      {
+         /* Pack the minor that omits (row, col) at the front of temp */
+         fill = 0;
+
+         for ( i = 0; i < size; i++ )
+         {
+            for ( j = 0; j < size; j++ )
+            {
+               temp[ i*size + j ] = 0;
+
+               if ( i != row && j != col )
+               {
+                  temp[ fill++ ] = pSrc[ i*size + j ];
+               }
+            }
+         }
+
+         pDst[ row*size + col ] = ref_pow( -1, row + col ) * ref_detrm64( temp, temp + (size-1)*(size-1), size - 1 );
+      }
+   }
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/ref_helper.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/ref_helper.c
new file mode 100644
index 0000000..57ecf1b
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/HelperFunctions/ref_helper.c
@@ -0,0 +1,103 @@
+#include "ref.h"
+
+float32_t scratchArray[8192*2]; /* File-scope scratch buffer of 16384 floats. NOTE(review): the 8192*2 size looks like room for 8192 interleaved complex values - confirm against its users. */
+
+arm_cfft_instance_f32 ref_cfft_sR_f32_len8192 = { 8192, 0, 0, 0 }; /* First member set to 8192, the remaining three to 0. Presumably fftLen = 8192 with the table pointers and bit-reversal length left empty - confirm against arm_cfft_instance_f32. */
+	
+q31_t ref_sat_n(q31_t num, uint32_t bits)
+{
+	/* Saturates num to the signed range of a bits-wide integer,
+	 * [-2^(bits-1), 2^(bits-1) - 1].
+	 *   num  - value to clamp
+	 *   bits - target width, meaningful for 1..31
+	 * Returns the clamped value. For bits >= 32 every q31_t value already
+	 * fits, and bits == 0 describes no valid range, so num is returned
+	 * unchanged in both cases. The previous repeated doubling overflowed a
+	 * signed 32-bit value for bits == 32 and, because bits - 1 wraps, looped
+	 * about 2^32 times for bits == 0. */
+	q63_t limit;
+
+	if (bits == 0U || bits >= 32U)
+	{
+		return num;
+	}
+
+	/* 2^(bits-1), held in 64 bits so the shift cannot overflow */
+	limit = (q63_t)1 << (bits - 1U);
+
+	if ((q63_t)num > (limit - 1))
+	{
+		return (q31_t)(limit - 1);
+	}
+
+	if ((q63_t)num < -limit)
+	{
+		return (q31_t)(-limit);
+	}
+
+	return num;
+}
+
+q31_t ref_sat_q31(q63_t num)
+{
+	/* Clamps a 64-bit value to the q31_t range [INT_MIN, INT_MAX]. */
+	if (num > (q63_t)INT_MAX)
+	{
+		return INT_MAX;
+	}
+
+	if (num < (q63_t)INT_MIN)
+	{
+		return INT_MIN;
+	}
+
+	return (q31_t)num;
+}
+
+q15_t ref_sat_q15(q31_t num)
+{
+	/* Clamps a 32-bit value to the q15_t range [SHRT_MIN, SHRT_MAX]. */
+	if (num > (q31_t)SHRT_MAX)
+	{
+		return SHRT_MAX;
+	}
+
+	if (num < (q31_t)SHRT_MIN)
+	{
+		return SHRT_MIN;
+	}
+
+	return (q15_t)num;
+}
+
+q7_t ref_sat_q7(q15_t num)
+{
+	/* Clamps a 16-bit value to the q7_t range [SCHAR_MIN, SCHAR_MAX]. */
+	if (num > (q15_t)SCHAR_MAX)
+	{
+		return SCHAR_MAX;
+	}
+
+	if (num < (q15_t)SCHAR_MIN)
+	{
+		return SCHAR_MIN;
+	}
+
+	return (q7_t)num;
+}
+
+float32_t ref_pow(float32_t a, uint32_t b)
+{
+	/* Raises a to the non-negative integer power b by repeated
+	 * multiplication; returns 1 when b is 0. */
+	float32_t result;
+	uint32_t remaining;
+
+	if (b == 0)
+	{
+		return 1;
+	}
+
+	/* Start from a and multiply b - 1 more times */
+	result = a;
+	for (remaining = b - 1; remaining > 0; remaining--)
+	{
+		result *= a;
+	}
+
+	return result;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/Intrinsics/Intrinsics_.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/Intrinsics/Intrinsics_.c
new file mode 100644
index 0000000..1536829
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/Intrinsics/Intrinsics_.c
@@ -0,0 +1,3 @@
+
+#include "intrinsics.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/Intrinsics/intrinsics.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/Intrinsics/intrinsics.c
new file mode 100644
index 0000000..7412adb
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/Intrinsics/intrinsics.c
@@ -0,0 +1,238 @@
+#include "ref.h"
+
+q31_t ref__QADD8(q31_t x, q31_t y)
+{
+   /* Adds the four signed bytes of x to the matching bytes of y, saturating
+      each sum to 8 bits, and packs the results back into one word. */
+   q7_t b0, b1, b2, b3;
+
+   b0 = ref_sat_n((q31_t) ((q7_t) x + (q7_t) y), 8);
+   b1 = ref_sat_n(((q31_t) (((x << 16) >> 24) + ((y << 16) >> 24))), 8);
+   b2 = ref_sat_n(((q31_t) (((x << 8) >> 24) + ((y << 8) >> 24))), 8);
+   b3 = ref_sat_n(((q31_t) ((x >> 24) + (y >> 24))), 8);
+
+   return (((q31_t) b3 << 24) & 0xFF000000) | (((q31_t) b2 << 16) & 0x00FF0000) |
+          (((q31_t) b1 << 8) & 0x0000FF00) | (b0 & 0x000000FF);
+}
+
+q31_t ref__QSUB8(q31_t x, q31_t y)
+{
+   /* Subtracts the four signed bytes of y from the matching bytes of x,
+      saturating each difference to 8 bits, and packs the results. */
+   q31_t b0, b1, b2, b3;
+
+   b0 = (q7_t) x;
+   b0 = ref_sat_n((b0 - (q7_t) y), 8);
+   b1 = ref_sat_n(((q31_t) (((x << 16) >> 24) - ((y << 16) >> 24))), 8) << 8;
+   b2 = ref_sat_n(((q31_t) (((x << 8) >> 24) - ((y << 8) >> 24))), 8) << 16;
+   b3 = ref_sat_n(((q31_t) ((x >> 24) - (y >> 24))), 8) << 24;
+
+   return (b3 & 0xFF000000) | (b2 & 0x00FF0000) | (b1 & 0x0000FF00) | (b0 & 0x000000FF);
+}
+
+q31_t ref__QADD16(q31_t x, q31_t y)
+{
+   /* Adds the two signed halfwords of x and y lane by lane, saturating each
+      sum to 16 bits. */
+   q31_t lo, hi;
+
+   lo = (q15_t) x;
+   lo = ref_sat_q15(lo + (q15_t) y);
+   hi = (q31_t)ref_sat_q15(((q31_t) ((x >> 16) + (y >> 16)))) << 16;
+
+   return (hi & 0xFFFF0000) | (lo & 0x0000FFFF);
+}
+
+q31_t ref__SHADD16(q31_t x, q31_t y)
+{
+   /* Adds the two signed halfwords of x and y lane by lane and halves each
+      sum with a right shift. */
+   q31_t lo, hi;
+
+   lo = (q15_t) x;
+   lo = (lo + (q15_t) y) >> 1;
+   hi = ((q31_t) (((x >> 16) + (y >> 16)) >> 1) << 16);
+
+   return (hi & 0xFFFF0000) | (lo & 0x0000FFFF);
+}
+
+q31_t ref__QSUB16(q31_t x, q31_t y)
+{
+   /* Subtracts the two signed halfwords of y from those of x lane by lane,
+      saturating each difference to 16 bits. */
+   q31_t lo, hi;
+
+   lo = (q15_t) x;
+   lo = ref_sat_q15(lo - (q15_t) y);
+   hi = (q31_t)ref_sat_q15(((q31_t) ((x >> 16) - (y >> 16)))) << 16;
+
+   return (hi & 0xFFFF0000) | (lo & 0x0000FFFF);
+}
+
+q31_t ref__SHSUB16(q31_t x, q31_t y)
+{
+   /* Signed halving subtract on two 16-bit lanes: each result lane is
+      (x lane - y lane) >> 1.
+      The full-width difference is formed before the shift, as
+      ref__SHADD16 does for sums. Halving each operand first drops the
+      borrow: for lanes 0 and 1 it produced 0 instead of -1. */
+   q31_t diff;
+   q31_t r, s;
+
+   r = (q15_t) x;
+   s = (q15_t) y;
+
+   r = (r - s) >> 1;
+   s = ((q31_t) (((x >> 16) - (y >> 16)) >> 1) << 16);
+
+   diff = (s & 0xFFFF0000) | (r & 0x0000FFFF);
+
+   return diff;
+}
+
+q31_t ref__QASX(q31_t x, q31_t y)
+{
+   /* Saturating add/subtract with exchange: the top lane is
+      sat16(x.hi + y.lo), the bottom lane is sat16(x.lo - y.hi). */
+   q31_t xLo = (q15_t)(x & 0xffff);
+   q31_t yLo = (q15_t)(y & 0xffff);
+   q31_t xHi = (q15_t)(x >> 16);
+   q31_t yHi = (q15_t)(y >> 16);
+   q31_t top, bottom;
+
+   top = ((q31_t)ref_sat_q15(xHi + yLo )) << 16;
+   bottom = ((q31_t)ref_sat_q15(xLo - yHi )) & 0xffff;
+
+   return top | bottom;
+}
+
+q31_t ref__SHASX(q31_t x, q31_t y)
+{
+   /* Signed halving add/subtract with exchange: the top lane is
+      (x.hi + y.lo) >> 1, the bottom lane is (x.lo - y.hi) >> 1.
+      The bottom lane now uses a right shift like the top lane. The
+      previous "/ 2" rounded toward zero, so an odd negative difference
+      came out one too high (-3 gave -1 instead of -2). */
+   q31_t sum;
+   q31_t r, s;
+
+   r = (q15_t) x;
+   s = (q15_t) y;
+
+   r = (r - (y >> 16)) >> 1;
+   /* << 15 followed by the 0xFFFF0000 mask equals (>> 1) placed in the top lane */
+   s = (((x >> 16) + s) << 15);
+
+   sum = (s & 0xFFFF0000) | (r & 0x0000FFFF);
+
+   return sum;
+}
+
+q31_t ref__QSAX(q31_t x, q31_t y)
+{
+   /* Saturating subtract/add with exchange: the top lane is
+      sat16(x.hi - y.lo), the bottom lane is sat16(x.lo + y.hi). */
+   q31_t xLo = (q15_t)(x & 0xffff);
+   q31_t yLo = (q15_t)(y & 0xffff);
+   q31_t xHi = (q15_t)(x >> 16);
+   q31_t yHi = (q15_t)(y >> 16);
+   q31_t top, bottom;
+
+   top = ((q31_t)ref_sat_q15(xHi - yLo )) << 16;
+   bottom = ((q31_t)ref_sat_q15(xLo + yHi )) & 0xffff;
+
+   return top | bottom;
+}
+
+q31_t ref__SHSAX(q31_t x, q31_t y)
+{
+   /* Signed halving subtract/add with exchange: the top lane is
+      (x.hi - y.lo) >> 1, the bottom lane is (x.lo + y.hi) >> 1.
+      The bottom lane now uses a right shift like the top lane. The
+      previous "/ 2" rounded toward zero, so an odd negative sum came out
+      one too high (-3 gave -1 instead of -2). */
+   q31_t sum;
+   q31_t r, s;
+
+   r = (q15_t) x;
+   s = (q15_t) y;
+
+   r = (r + (y >> 16)) >> 1;
+   /* << 15 followed by the 0xFFFF0000 mask equals (>> 1) placed in the top lane */
+   s = (((x >> 16) - s) << 15);
+
+   sum = (s & 0xFFFF0000) | (r & 0x0000FFFF);
+
+   return sum;
+}
+
+q31_t ref__SMUSDX(q31_t x, q31_t y)
+{
+   /* Cross-multiplies the signed halfwords and returns
+      (x.lo * y.hi) - (x.hi * y.lo). */
+   q15_t xLo = (q15_t) x;
+   q15_t xHi = (q15_t) (x >> 16);
+   q15_t yLo = (q15_t) y;
+   q15_t yHi = (q15_t) (y >> 16);
+
+   return ((q31_t) ((xLo * yHi) - (xHi * yLo)));
+}
+
+q31_t ref__SMUADX(q31_t x, q31_t y)
+{
+   /* Cross-multiplies the signed halfwords and returns
+      (x.lo * y.hi) + (x.hi * y.lo). */
+   q15_t xLo = (q15_t) x;
+   q15_t xHi = (q15_t) (x >> 16);
+   q15_t yLo = (q15_t) y;
+   q15_t yHi = (q15_t) (y >> 16);
+
+   return ((q31_t) ((xLo * yHi) + (xHi * yLo)));
+}
+
+q31_t ref__QADD(q31_t x, q31_t y)
+{
+   /* 32-bit saturating addition: the sum is formed in 64 bits and then
+      clamped by ref_sat_q31. */
+   q63_t wide = (q63_t) x + y;
+
+   return ref_sat_q31(wide);
+}
+
+q31_t ref__QSUB(q31_t x, q31_t y)
+{
+   /* 32-bit saturating subtraction: the difference is formed in 64 bits
+      and then clamped by ref_sat_q31. */
+   q63_t wide = (q63_t) x - y;
+
+   return ref_sat_q31(wide);
+}
+
+q31_t ref__SMLAD(q31_t x, q31_t y, q31_t sum)
+{
+   /* Dual 16-bit multiply-accumulate: sum + (x.hi * y.hi) + (x.lo * y.lo). */
+   q15_t xLo = (q15_t) x;
+   q15_t xHi = (q15_t) (x >> 16);
+   q15_t yLo = (q15_t) y;
+   q15_t yHi = (q15_t) (y >> 16);
+
+   return (sum + (xHi * yHi) + (xLo * yLo));
+}
+
+q31_t ref__SMLADX(q31_t x, q31_t y, q31_t sum)
+{
+   /* Dual 16-bit multiply-accumulate with exchanged halfwords:
+      sum + (x.hi * y.lo) + (x.lo * y.hi). */
+   q15_t xLo = (q15_t) x;
+   q15_t xHi = (q15_t) (x >> 16);
+   q15_t yLo = (q15_t) (y);
+   q15_t yHi = (q15_t) (y >> 16);
+
+   return (sum + (xHi * yLo) + (xLo * yHi));
+}
+
+q31_t ref__SMLSDX(q31_t x, q31_t y, q31_t sum)
+{
+   /* Dual 16-bit multiply-subtract with exchanged halfwords:
+      sum - (x.hi * y.lo) + (x.lo * y.hi). */
+   q15_t xLo = (q15_t) x;
+   q15_t xHi = (q15_t) (x >> 16);
+   q15_t yLo = (q15_t) (y);
+   q15_t yHi = (q15_t) (y >> 16);
+
+   return (sum - (xHi * yLo) + (xLo * yHi));
+}
+
+q63_t ref__SMLALD(q31_t x, q31_t y, q63_t sum)
+{
+   /* Dual 16-bit multiply with 64-bit accumulate:
+      sum + (x.hi * y.hi) + (x.lo * y.lo). */
+   q15_t xLo = (q15_t) x;
+   q15_t xHi = (q15_t) (x >> 16);
+   q15_t yLo = (q15_t) y;
+   q15_t yHi = (q15_t) (y >> 16);
+
+   return (sum + (xHi * yHi) + (xLo * yLo));
+}
+
+q63_t ref__SMLALDX(q31_t x, q31_t y, q63_t sum)
+{
+   /* Dual 16-bit multiply with exchanged halfwords and 64-bit accumulate:
+      sum + (x.hi * y.lo) + (x.lo * y.hi). */
+   q15_t xLo = (q15_t) x;
+   q15_t xHi = (q15_t) (x >> 16);
+   q15_t yLo = (q15_t) y;
+   q15_t yHi = (q15_t) (y >> 16);
+
+   return (sum + (xHi * yLo)) + (xLo * yHi);
+}
+
+q31_t ref__SMUAD(q31_t x, q31_t y)
+{
+   /* Dual 16-bit multiply and add: (x.hi * y.hi) + (x.lo * y.lo). The low
+      halfwords are sign-extended by shifting up and back down by 16. */
+   q31_t xHi = x >> 16;
+   q31_t yHi = y >> 16;
+   q31_t xLo = (x << 16) >> 16;
+   q31_t yLo = (y << 16) >> 16;
+
+   return ((xHi * yHi) + (xLo * yLo));
+}
+
+q31_t ref__SMUSD(q31_t x, q31_t y)
+{
+   /* Dual 16-bit multiply and subtract: (x.lo * y.lo) - (x.hi * y.hi),
+      evaluated as the negated high product plus the low product. */
+   q31_t xHi = x >> 16;
+   q31_t yHi = y >> 16;
+   q31_t xLo = (x << 16) >> 16;
+   q31_t yLo = (y << 16) >> 16;
+
+   return (-(xHi * yHi) + (xLo * yLo));
+}
+
+q31_t ref__SXTB16(q31_t x)
+{
+   /* Sign-extends byte 0 of x into the low halfword and byte 2 of x into
+      the high halfword of the result. */
+   q31_t low = ((x << 24) >> 24) & 0x0000FFFF;
+   q31_t high = ((x << 8) >> 8) & 0xFFFF0000;
+
+   return (low | high);
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/MatrixFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/MatrixFunctions.c
new file mode 100644
index 0000000..b285455
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/MatrixFunctions.c
@@ -0,0 +1,9 @@
+
+#include "mat_add.c"
+#include "mat_cmplx_mult.c"
+#include "mat_inverse.c"
+#include "mat_mult.c"
+#include "mat_scale.c"
+#include "mat_sub.c"
+#include "mat_trans.c"
+
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_add.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_add.c
new file mode 100644
index 0000000..a6e0067
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_add.c
@@ -0,0 +1,58 @@
+#include "ref.h"
+
+/* Element-wise pDst = pSrcA + pSrcB. The element count comes from pSrcA; dimensions are not checked. */
+arm_status ref_mat_add_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+	uint32_t k = 0;
+
+	while (k < total)
+	{
+		pDst->pData[k] = pSrcA->pData[k] + pSrcB->pData[k];
+		k++;
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/* Element-wise pDst = pSrcA + pSrcB; each 64-bit sum is passed through ref_sat_q31(). */
+arm_status ref_mat_add_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+  arm_matrix_instance_q31 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+	uint32_t k = 0;
+
+	while (k < total)
+	{
+		pDst->pData[k] = ref_sat_q31( (q63_t)pSrcA->pData[k] + pSrcB->pData[k]);
+		k++;
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/* Element-wise pDst = pSrcA + pSrcB; each 32-bit sum is passed through ref_sat_q15(). */
+arm_status ref_mat_add_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+  arm_matrix_instance_q15 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+	uint32_t k = 0;
+
+	while (k < total)
+	{
+		pDst->pData[k] = ref_sat_q15( (q31_t)pSrcA->pData[k] + pSrcB->pData[k]);
+		k++;
+	}
+
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_cmplx_mult.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_cmplx_mult.c
new file mode 100644
index 0000000..9364619
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_cmplx_mult.c
@@ -0,0 +1,118 @@
+#include "ref.h"
+
+/*
+ * Complex matrix product pDst = pSrcA * pSrcB.
+ * Each element occupies two consecutive floats (real, then imaginary).
+ * Dimensions are not checked.
+ */
+arm_status ref_mat_cmplx_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+	const uint32_t rowsOut = pSrcA->numRows;
+	const uint32_t colsOut = pSrcB->numCols;
+	const uint32_t inner = pSrcA->numCols;
+	uint32_t row, col, k;
+
+	for (row = 0; row < rowsOut; row++)
+	{
+		for (col = 0; col < colsOut; col++)
+		{
+			float32_t accRe = 0;
+			float32_t accIm = 0;
+
+			for (k = 0; k < inner; k++)
+			{
+				const float32_t *lhs = &pSrcA->pData[2*(row*inner + k)];
+				const float32_t *rhs = &pSrcB->pData[2*(k*colsOut + col)];
+				const float32_t lhsRe = lhs[0];
+				const float32_t lhsIm = lhs[1];
+				const float32_t rhsRe = rhs[0];
+				const float32_t rhsIm = rhs[1];
+
+				accRe += lhsRe * rhsRe - lhsIm * rhsIm;
+				accIm += lhsIm * rhsRe + lhsRe * rhsIm;
+			}
+
+			pDst->pData[2*(row*colsOut + col) + 0] = accRe;
+			pDst->pData[2*(row*colsOut + col) + 1] = accIm;
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Complex matrix product pDst = pSrcA * pSrcB for interleaved (real, imag) q31 data.
+ * Products are accumulated in 64 bits; each accumulator is shifted right by 31
+ * and passed through ref_sat_q31(). Dimensions are not checked.
+ */
+arm_status ref_mat_cmplx_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+  arm_matrix_instance_q31 * pDst)
+{
+	const uint32_t rowsOut = pSrcA->numRows;
+	const uint32_t colsOut = pSrcB->numCols;
+	const uint32_t inner = pSrcA->numCols;
+	uint32_t row, col, k;
+
+	for (row = 0; row < rowsOut; row++)
+	{
+		for (col = 0; col < colsOut; col++)
+		{
+			q63_t accRe = 0;
+			q63_t accIm = 0;
+
+			for (k = 0; k < inner; k++)
+			{
+				const q31_t *lhs = &pSrcA->pData[2*(row*inner + k)];
+				const q31_t *rhs = &pSrcB->pData[2*(k*colsOut + col)];
+				const q31_t lhsRe = lhs[0];
+				const q31_t lhsIm = lhs[1];
+				const q31_t rhsRe = rhs[0];
+				const q31_t rhsIm = rhs[1];
+
+				accRe += (q63_t)lhsRe * rhsRe - (q63_t)lhsIm * rhsIm;
+				accIm += (q63_t)lhsIm * rhsRe + (q63_t)lhsRe * rhsIm;
+			}
+
+			pDst->pData[2*(row*colsOut + col) + 0] = ref_sat_q31(accRe >> 31);
+			pDst->pData[2*(row*colsOut + col) + 1] = ref_sat_q31(accIm >> 31);
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Complex matrix product pDst = pSrcA * pSrcB for interleaved (real, imag) q15 data.
+ * Products are accumulated in 64 bits; each accumulator is shifted right by 15
+ * and passed through ref_sat_q15(). Dimensions are not checked.
+ */
+arm_status ref_mat_cmplx_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+  arm_matrix_instance_q15 * pDst)
+{
+	uint32_t r,c,i,outR,outC,innerSize;
+	q63_t sumR,sumI;
+	q15_t a0,b0,c0,d0;
+	
+	outR = pSrcA->numRows;
+	outC = pSrcB->numCols;
+	innerSize = pSrcA->numCols;
+	
+	for(r=0;r<outR;r++)
+	{
+		for(c=0;c<outC;c++)
+		{
+			sumR = 0;
+			sumI = 0;
+			
+			for(i=0;i<innerSize;i++)
+			{
+				a0 = pSrcA->pData[2*(r*innerSize + i) + 0];
+				b0 = pSrcA->pData[2*(r*innerSize + i) + 1];
+				c0 = pSrcB->pData[2*(i*outC + c) + 0]; 
+				d0 = pSrcB->pData[2*(i*outC + c) + 1];	
+				
+				/* Widen to 64 bits before combining the two products: in 32 bits,
+				   (-32768 * -32768) + (-32768 * -32768) = 2^31 overflows. */
+				sumR += (q63_t)a0 * c0 - (q63_t)b0 * d0;
+				sumI += (q63_t)b0 * c0 + (q63_t)a0 * d0;
+			}
+			
+			pDst->pData[2*(r*outC + c) + 0] = ref_sat_q15(sumR >> 15);
+			pDst->pData[2*(r*outC + c) + 1] = ref_sat_q15(sumI >> 15);
+		}
+	}
+	
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_inverse.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_inverse.c
new file mode 100644
index 0000000..74d3ccc
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_inverse.c
@@ -0,0 +1,57 @@
+#include "ref.h"
+
+/*
+ * Writes into pDst the transpose of the matrix that ref_cofact() leaves in
+ * scratchArray, with every element divided by the value from ref_detrm().
+ * Presumably this is the adjugate / determinant inverse; the helpers are
+ * defined elsewhere, so confirm against them.
+ *
+ * The element count is numCols * numCols, i.e. the source is treated as square.
+ * There is no check for a zero determinant and the function always returns
+ * ARM_MATH_SUCCESS.
+ */
+arm_status ref_mat_inverse_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pDst)
+{
+	float32_t det;
+	uint32_t i, size;
+	arm_matrix_instance_f32 tmp;
+	
+	/* tmp is a matrix view over the shared scratchArray with pSrc's dimensions. */
+	tmp.numCols = pSrc->numCols;
+	tmp.numRows = pSrc->numRows;
+	tmp.pData = scratchArray;
+	
+	/* scratchArray is handed to ref_detrm() here, before ref_cofact() reuses it
+	   (presumably as workspace; TODO confirm). */
+	det = ref_detrm(pSrc->pData,scratchArray,pSrc->numCols);
+	
+	size = pSrc->numCols * pSrc->numCols;
+	
+	/* Presumably the result lands in scratchArray[0..size) and scratchArray + size
+	   is workspace; verify against ref_cofact(). */
+	ref_cofact(pSrc->pData,scratchArray,scratchArray + size,pSrc->numCols);
+	
+	ref_mat_trans_f32(&tmp,pDst);	
+	
+	for(i=0;i<size;i++)
+	{
+		pDst->pData[i] /= det;
+	}
+	
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Double-precision counterpart of the f32 inverse: writes into pDst the
+ * transpose of the matrix that ref_cofact64() leaves in scratchArray, with
+ * every element divided by the value from ref_detrm64(). Presumably this is
+ * the adjugate / determinant inverse; confirm against the helpers.
+ *
+ * The element count is numCols * numCols, i.e. the source is treated as square.
+ * There is no check for a zero determinant and the function always returns
+ * ARM_MATH_SUCCESS.
+ */
+arm_status ref_mat_inverse_f64(
+  const arm_matrix_instance_f64 * pSrc,
+  arm_matrix_instance_f64 * pDst)
+{
+	float64_t det;
+	uint32_t i, size;
+	arm_matrix_instance_f64 tmp;
+	
+	/* tmp is a matrix view over the shared scratchArray, reinterpreted as float64_t.
+	   NOTE(review): assumes scratchArray is large and aligned enough for this; confirm. */
+	tmp.numCols = pSrc->numCols;
+	tmp.numRows = pSrc->numRows;
+	tmp.pData = (float64_t*)scratchArray;
+	
+	/* scratchArray is handed to ref_detrm64() here, before ref_cofact64() reuses it
+	   (presumably as workspace; TODO confirm). */
+	det = ref_detrm64(pSrc->pData,(float64_t*)scratchArray,pSrc->numCols);
+	
+	size = pSrc->numCols * pSrc->numCols;
+	
+	/* Presumably the result lands in the first `size` doubles and the region after
+	   it is workspace; verify against ref_cofact64(). */
+	ref_cofact64(pSrc->pData,(float64_t*)scratchArray,(float64_t*)scratchArray + size,pSrc->numCols);
+	
+	ref_mat_trans_f64(&tmp,pDst);	
+	
+	for(i=0;i<size;i++)
+	{
+		pDst->pData[i] /= det;
+	}
+	
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_mult.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_mult.c
new file mode 100644
index 0000000..e9ef432
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_mult.c
@@ -0,0 +1,91 @@
+#include "ref.h"
+
+/* Row-major matrix product pDst = pSrcA * pSrcB. Dimensions are not checked. */
+arm_status ref_mat_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+	const uint32_t rowsOut = pSrcA->numRows;
+	const uint32_t colsOut = pSrcB->numCols;
+	const uint32_t inner = pSrcA->numCols;
+	uint32_t row, col, k;
+
+	for (row = 0; row < rowsOut; row++)
+	{
+		for (col = 0; col < colsOut; col++)
+		{
+			float32_t acc = 0;
+
+			/* Dot product of row `row` of A with column `col` of B. */
+			for (k = 0; k < inner; k++)
+			{
+				acc += pSrcA->pData[row*inner + k] * pSrcB->pData[k*colsOut + col];
+			}
+
+			pDst->pData[row*colsOut + col] = acc;
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Row-major matrix product pDst = pSrcA * pSrcB for q31 data.
+ * Products are accumulated in 64 bits, shifted right by 31 and passed
+ * through ref_sat_q31(). Dimensions are not checked.
+ */
+arm_status ref_mat_mult_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+  arm_matrix_instance_q31 * pDst)
+{
+	const uint32_t rowsOut = pSrcA->numRows;
+	const uint32_t colsOut = pSrcB->numCols;
+	const uint32_t inner = pSrcA->numCols;
+	uint32_t row, col, k;
+
+	for (row = 0; row < rowsOut; row++)
+	{
+		for (col = 0; col < colsOut; col++)
+		{
+			q63_t acc = 0;
+
+			for (k = 0; k < inner; k++)
+			{
+				acc += (q63_t)(pSrcA->pData[row*inner + k]) * pSrcB->pData[k*colsOut + col];
+			}
+
+			pDst->pData[row*colsOut + col] = ref_sat_q31(acc >> 31);
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Row-major matrix product pDst = pSrcA * pSrcB for q15 data.
+ * 32-bit products are accumulated in 64 bits, shifted right by 15 and
+ * passed through ref_sat_q15(). Dimensions are not checked.
+ */
+arm_status ref_mat_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+  arm_matrix_instance_q15 * pDst)
+{
+	const uint32_t rowsOut = pSrcA->numRows;
+	const uint32_t colsOut = pSrcB->numCols;
+	const uint32_t inner = pSrcA->numCols;
+	uint32_t row, col, k;
+
+	for (row = 0; row < rowsOut; row++)
+	{
+		for (col = 0; col < colsOut; col++)
+		{
+			q63_t acc = 0;
+
+			for (k = 0; k < inner; k++)
+			{
+				acc += (q31_t)(pSrcA->pData[row*inner + k]) * pSrcB->pData[k*colsOut + col];
+			}
+
+			pDst->pData[row*colsOut + col] = ref_sat_q15(acc >> 15);
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_scale.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_scale.c
new file mode 100644
index 0000000..d426ad6
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_scale.c
@@ -0,0 +1,64 @@
+#include "ref.h"
+
+/* Multiplies every element of pSrc by scale and stores the result in pDst. */
+arm_status ref_mat_scale_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  float32_t scale,
+  arm_matrix_instance_f32 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrc->numRows * pSrc->numCols;
+	uint32_t k = 0;
+
+	while (k < total)
+	{
+		pDst->pData[k] = pSrc->pData[k] * scale;
+		k++;
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Scales every element of pSrc: takes the upper 32 bits of the 64-bit
+ * product with scale, shifts that left by (shift + 1) in 64 bits and
+ * passes the result through ref_sat_q31().
+ */
+arm_status ref_mat_scale_q31(
+  const arm_matrix_instance_q31 * pSrc,
+  q31_t scale,
+  int32_t shift,
+  arm_matrix_instance_q31 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrc->numRows * pSrc->numCols;
+	const int32_t leftShift = shift + 1;
+	uint32_t k;
+
+	for (k = 0; k < total; k++)
+	{
+		q31_t upper = ((q63_t)pSrc->pData[k] * scale) >> 32;
+
+		pDst->pData[k] = ref_sat_q31((q63_t)upper << leftShift );
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/*
+ * Scales every element of pSrc: the 32-bit product with scale is shifted
+ * right by (15 - shift) and passed through ref_sat_q15().
+ */
+arm_status ref_mat_scale_q15(
+  const arm_matrix_instance_q15 * pSrc,
+  q15_t scale,
+  int32_t shift,
+  arm_matrix_instance_q15 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrc->numRows * pSrc->numCols;
+	const int32_t rightShift = 15 - shift;
+	uint32_t k;
+
+	for (k = 0; k < total; k++)
+	{
+		pDst->pData[k] = ref_sat_q15( ((q31_t)pSrc->pData[k] * scale) >> rightShift);
+	}
+
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_sub.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_sub.c
new file mode 100644
index 0000000..bbd23f0
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_sub.c
@@ -0,0 +1,58 @@
+#include "ref.h"
+
+/* Element-wise pDst = pSrcA - pSrcB. The element count comes from pSrcA; dimensions are not checked. */
+arm_status ref_mat_sub_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+	uint32_t k = 0;
+
+	while (k < total)
+	{
+		pDst->pData[k] = pSrcA->pData[k] - pSrcB->pData[k];
+		k++;
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/* Element-wise pDst = pSrcA - pSrcB; each 64-bit difference is passed through ref_sat_q31(). */
+arm_status ref_mat_sub_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+  arm_matrix_instance_q31 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+	uint32_t k = 0;
+
+	while (k < total)
+	{
+		pDst->pData[k] = ref_sat_q31( (q63_t)pSrcA->pData[k] - pSrcB->pData[k]);
+		k++;
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/* Element-wise pDst = pSrcA - pSrcB; each 32-bit difference is passed through ref_sat_q15(). */
+arm_status ref_mat_sub_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+  arm_matrix_instance_q15 * pDst)
+{
+	const uint32_t total = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+	uint32_t k = 0;
+
+	while (k < total)
+	{
+		pDst->pData[k] = ref_sat_q15( (q31_t)pSrcA->pData[k] - pSrcB->pData[k]);
+		k++;
+	}
+
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_trans.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_trans.c
new file mode 100644
index 0000000..8cb9a8d
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/MatrixFunctions/mat_trans.c
@@ -0,0 +1,77 @@
+#include "ref.h"
+
+/* Writes the transpose of the row-major matrix pSrc into pDst->pData. */
+arm_status ref_mat_trans_f64(
+  const arm_matrix_instance_f64 * pSrc,
+  arm_matrix_instance_f64 * pDst)
+{
+	const uint64_t rows = pSrc->numRows;
+	const uint64_t cols = pSrc->numCols;
+	uint64_t row, col;
+	uint64_t srcPos = 0;    /* equals row*cols + col: the source is read sequentially */
+
+	for (row = 0; row < rows; row++)
+	{
+		for (col = 0; col < cols; col++)
+		{
+			pDst->pData[col*rows + row] = pSrc->pData[srcPos];
+			srcPos++;
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/* Writes the transpose of the row-major matrix pSrc into pDst->pData. */
+arm_status ref_mat_trans_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pDst)
+{
+	const uint32_t rows = pSrc->numRows;
+	const uint32_t cols = pSrc->numCols;
+	uint32_t row, col;
+	uint32_t srcPos = 0;    /* equals row*cols + col: the source is read sequentially */
+
+	for (row = 0; row < rows; row++)
+	{
+		for (col = 0; col < cols; col++)
+		{
+			pDst->pData[col*rows + row] = pSrc->pData[srcPos];
+			srcPos++;
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/* Writes the transpose of the row-major matrix pSrc into pDst->pData. */
+arm_status ref_mat_trans_q31(
+  const arm_matrix_instance_q31 * pSrc,
+  arm_matrix_instance_q31 * pDst)
+{
+	const uint32_t rows = pSrc->numRows;
+	const uint32_t cols = pSrc->numCols;
+	uint32_t row, col;
+	uint32_t srcPos = 0;    /* equals row*cols + col: the source is read sequentially */
+
+	for (row = 0; row < rows; row++)
+	{
+		for (col = 0; col < cols; col++)
+		{
+			pDst->pData[col*rows + row] = pSrc->pData[srcPos];
+			srcPos++;
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
+
+/* Writes the transpose of the row-major matrix pSrc into pDst->pData. */
+arm_status ref_mat_trans_q15(
+  const arm_matrix_instance_q15 * pSrc,
+  arm_matrix_instance_q15 * pDst)
+{
+	const uint32_t rows = pSrc->numRows;
+	const uint32_t cols = pSrc->numCols;
+	uint32_t row, col;
+	uint32_t srcPos = 0;    /* equals row*cols + col: the source is read sequentially */
+
+	for (row = 0; row < rows; row++)
+	{
+		for (col = 0; col < cols; col++)
+		{
+			pDst->pData[col*rows + row] = pSrc->pData[srcPos];
+			srcPos++;
+		}
+	}
+
+	return ARM_MATH_SUCCESS;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/StatisticsFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/StatisticsFunctions.c
new file mode 100644
index 0000000..6791b3e
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/StatisticsFunctions.c
@@ -0,0 +1,8 @@
+
+#include "max.c"
+#include "mean.c"
+#include "min.c"
+#include "power.c"
+#include "rms.c"
+#include "std.c"
+#include "var.c"
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/max.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/max.c
new file mode 100644
index 0000000..02b4127
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/max.c
@@ -0,0 +1,85 @@
+#include "ref.h"
+
+/*
+ * Finds the largest element of pSrc and the index of its first occurrence.
+ * The search starts from -FLT_MAX, which is also the result (with index 0)
+ * when no element compares greater than it.
+ */
+void ref_max_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult,
+  uint32_t * pIndex)
+{
+	float32_t best = -FLT_MAX;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const float32_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur > best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
+
+/* Finds the largest element of pSrc and the index of its first occurrence (INT_MIN, 0 for an empty block). */
+void ref_max_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult,
+  uint32_t * pIndex)
+{
+	q31_t best = INT_MIN;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const q31_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur > best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
+
+/* Finds the largest element of pSrc and the index of its first occurrence (SHRT_MIN, 0 for an empty block). */
+void ref_max_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult,
+  uint32_t * pIndex)
+{
+	q15_t best = SHRT_MIN;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const q15_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur > best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
+
+/* Finds the largest element of pSrc and the index of its first occurrence (SCHAR_MIN, 0 for an empty block). */
+void ref_max_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult,
+  uint32_t * pIndex)
+{
+	q7_t best = SCHAR_MIN;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const q7_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur > best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/mean.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/mean.c
new file mode 100644
index 0000000..d4b4da4
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/mean.c
@@ -0,0 +1,61 @@
+#include "ref.h"
+
+/* Stores in *pResult the sum of the block divided by blockSize. */
+void ref_mean_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+	float32_t total = 0;
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		total += pSrc[k];
+		k++;
+	}
+
+	*pResult = total / (float32_t)blockSize;
+}
+
+/*
+ * Stores in *pResult the 64-bit sum of the block divided by blockSize
+ * (integer division, truncating toward zero).
+ * An empty block yields 0 instead of dividing by zero.
+ */
+void ref_mean_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+	uint32_t i;
+	q63_t sum=0;
+	
+	if (blockSize == 0)
+	{
+		*pResult = 0;
+		return;
+	}
+	
+	for(i=0;i<blockSize;i++)
+	{
+			sum += pSrc[i];
+	}
+	/* Divide by the count as a 64-bit value: a 32-bit signed cast would turn
+	   counts above INT_MAX negative. */
+	*pResult = (q31_t) (sum / (q63_t) blockSize);
+}
+
+/*
+ * Stores in *pResult the 32-bit sum of the block divided by blockSize
+ * (integer division, truncating toward zero).
+ * An empty block yields 0 instead of dividing by zero.
+ */
+void ref_mean_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+	uint32_t i;
+	q31_t sum=0;
+	
+	if (blockSize == 0)
+	{
+		*pResult = 0;
+		return;
+	}
+	
+	for(i=0;i<blockSize;i++)
+	{
+			sum += pSrc[i];
+	}
+	*pResult = (q15_t) (sum / (int32_t) blockSize);
+}
+
+/*
+ * Stores in *pResult the 32-bit sum of the block divided by blockSize
+ * (integer division, truncating toward zero).
+ * An empty block yields 0 instead of dividing by zero.
+ */
+void ref_mean_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult)
+{
+	uint32_t i;
+	q31_t sum=0;
+	
+	if (blockSize == 0)
+	{
+		*pResult = 0;
+		return;
+	}
+	
+	for(i=0;i<blockSize;i++)
+	{
+			sum += pSrc[i];
+	}
+	*pResult = (q7_t) (sum / (int32_t) blockSize);
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/min.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/min.c
new file mode 100644
index 0000000..6072b1a
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/min.c
@@ -0,0 +1,85 @@
+#include "ref.h"
+
+/*
+ * Finds the smallest element of pSrc and the index of its first occurrence.
+ * The search starts from FLT_MAX, which is also the result (with index 0)
+ * when no element compares less than it.
+ */
+void ref_min_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult,
+  uint32_t * pIndex)
+{
+	float32_t best = FLT_MAX;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const float32_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur < best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
+
+/* Finds the smallest element of pSrc and the index of its first occurrence (INT_MAX, 0 for an empty block). */
+void ref_min_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult,
+  uint32_t * pIndex)
+{
+	q31_t best = INT_MAX;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const q31_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur < best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
+
+/* Finds the smallest element of pSrc and the index of its first occurrence (SHRT_MAX, 0 for an empty block). */
+void ref_min_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult,
+  uint32_t * pIndex)
+{
+	q15_t best = SHRT_MAX;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const q15_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur < best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
+
+/* Finds the smallest element of pSrc and the index of its first occurrence (SCHAR_MAX, 0 for an empty block). */
+void ref_min_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult,
+  uint32_t * pIndex)
+{
+	q7_t best = SCHAR_MAX;
+	uint32_t bestAt = 0;
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const q7_t cur = pSrc[k];
+
+		/* Strict comparison keeps the earliest index on ties. */
+		if (cur < best)
+		{
+			best = cur;
+			bestAt = k;
+		}
+	}
+
+	*pResult = best;
+	*pIndex = bestAt;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/power.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/power.c
new file mode 100644
index 0000000..8202e04
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/power.c
@@ -0,0 +1,61 @@
+#include "ref.h"
+
+/* Stores in *pResult the sum of the squares of the block's elements. */
+void ref_power_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+	float32_t energy = 0;
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		energy += pSrc[k] * pSrc[k];
+		k++;
+	}
+
+	*pResult = energy;
+}
+
+/* Stores in *pResult the sum of the 64-bit squares, each shifted right by 14 before accumulation. */
+void ref_power_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+	q63_t energy = 0;
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		energy += ((q63_t)pSrc[k] * pSrc[k]) >> 14;
+		k++;
+	}
+
+	*pResult = energy;
+}
+
+/* Stores in *pResult the 64-bit sum of the squares of the block's elements. */
+void ref_power_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+	q63_t energy = 0;
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		energy += (q63_t)pSrc[k] * pSrc[k];
+		k++;
+	}
+
+	*pResult = energy;
+}
+
+/* Stores in *pResult the 32-bit sum of the squares of the block's elements. */
+void ref_power_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+	q31_t energy = 0;
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		energy += (q31_t)pSrc[k] * pSrc[k];
+		k++;
+	}
+
+	*pResult = energy;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/rms.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/rms.c
new file mode 100644
index 0000000..94943a9
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/rms.c
@@ -0,0 +1,65 @@
+#include "ref.h"
+
+/* Stores in *pResult the square root of (sum of squares / blockSize). */
+void ref_rms_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+	float32_t energy = 0;
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		energy += pSrc[k] * pSrc[k];
+		k++;
+	}
+
+	*pResult = sqrtf(energy / (float32_t)blockSize);
+}
+
+/*
+ * Root mean square of a q31 block.
+ *
+ * The squares are summed in an unsigned 64-bit accumulator, divided by
+ * blockSize, shifted right by 31 and passed through ref_sat_q31(). The
+ * square root is then taken in single-precision float on the value scaled
+ * to [0, 1) and scaled back by 2^31.
+ */
+void ref_rms_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+	uint32_t i;
+	uint64_t sumsq = 0;             /* sum of squares is never negative, so it is accumulated unsigned */
+	q63_t tmp1;
+	q31_t tmp2;
+	/* A union gives a well-defined view of the float's bit pattern; casting
+	   the float's address to int* violates the strict-aliasing rule. */
+	union { float f; uint32_t u; } help;
+
+	for(i=0;i<blockSize;i++)
+	{
+			sumsq += (q63_t)pSrc[i] * pSrc[i];
+	}
+	tmp1 = (sumsq / (q63_t)blockSize) >> 31;
+	tmp2 = ref_sat_q31(tmp1);
+
+	/* GCC M0 problem: __aeabi_f2iz(QNAN) returns not 0 */
+	help.f = (sqrtf((float)tmp2 / 2147483648.0f) * 2147483648.0f);
+	/* Checking for a quiet NaN bit pattern in the float result */
+	if ((help.u & 0x7FC00000) == 0x7FC00000) {
+		help.f = 0;
+	}
+
+	/* (float)tmp2 rounds values near INT_MAX up to 2^31, so the product can
+	   be exactly 2^31; converting that straight to q31_t is undefined, so go
+	   through 64 bits and saturate. */
+	*pResult = ref_sat_q31((q63_t)help.f);
+}
+
+/*
+ * Root mean square of a q15 block: the 64-bit sum of squares is divided by
+ * blockSize, shifted right by 15 and passed through ref_sat_q15(); the square
+ * root is taken in float on the value scaled by 1/32768 and scaled back.
+ */
+void ref_rms_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+	q63_t energy = 0;
+	q31_t meanSq;
+	q15_t clipped;
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		energy += (q63_t)pSrc[k] * pSrc[k];
+		k++;
+	}
+
+	meanSq = (energy / (q63_t)blockSize) >> 15;
+	clipped = ref_sat_q15(meanSq);
+	*pResult = (q15_t)(sqrtf((float)clipped / 32768.0f) * 32768.0f);
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/std.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/std.c
new file mode 100644
index 0000000..c0c1ba3
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/std.c
@@ -0,0 +1,74 @@
+#include "ref.h"
+
+/*
+ * Stores in *pResult sqrt((sumsq - sum^2 / N) / (N - 1)) for the block,
+ * where N is blockSize. A block of exactly one element yields 0.
+ */
+void ref_std_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+	float32_t total = 0, energy = 0;
+	uint32_t k;
+
+	if (blockSize != 1)
+	{
+		for (k = 0; k < blockSize; k++)
+		{
+			total += pSrc[k];
+			energy += pSrc[k] * pSrc[k];
+		}
+		*pResult = sqrtf((energy - total * total / (float32_t)blockSize) / ((float32_t)blockSize - 1));
+	}
+	else
+	{
+		*pResult = 0;
+	}
+}
+
+/*
+ * Standard deviation of a q31 block.
+ *
+ * Each input is shifted right by 8 before it is accumulated. The result is
+ * the float square root of (sumsq / (N - 1) - sum^2 / (N * (N - 1))) >> 15,
+ * scaled by 1/2^31 for the square root and back afterwards, where N is
+ * blockSize. Blocks of fewer than two elements yield 0.
+ */
+void ref_std_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+	uint32_t i;
+	q63_t sum=0, sumsq=0;
+	q31_t in;
+	
+	/* blockSize == 0 would otherwise divide by zero below. */
+	if (blockSize <= 1)
+	{
+		*pResult = 0;
+		return;
+	}
+	
+	for(i=0;i<blockSize;i++)
+	{
+			in = pSrc[i] >> 8;
+			sum += in;
+			sumsq += (q63_t)in * in;
+	}
+	sumsq /= (q63_t)(blockSize - 1);
+	/* Widen before multiplying: blockSize * (blockSize - 1) wraps in 32 bits
+	   once blockSize exceeds 65536. */
+	sum = sum * sum / ((q63_t)blockSize * (blockSize - 1));
+	*pResult = (q31_t)(sqrtf((float)( (sumsq - sum) >> 15) / 2147483648.0f ) * 2147483648.0f);
+}
+
+/*
+ * Standard deviation of a q15 block.
+ *
+ * The result is the float square root of
+ * ref_sat_q15((sumsq / (N - 1) - sum^2 / (N * (N - 1))) >> 15), scaled by
+ * 1/32768 for the square root and back afterwards, where N is blockSize.
+ * Blocks of fewer than two elements yield 0.
+ */
+void ref_std_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+	uint32_t i;
+	q31_t sum=0;
+	q63_t sumsq=0;
+	
+	/* blockSize == 0 would otherwise divide by zero below. */
+	if (blockSize <= 1)
+	{
+		*pResult = 0;
+		return;
+	}
+	
+	for(i=0;i<blockSize;i++)
+	{
+			sum += pSrc[i];
+			sumsq += (q63_t)pSrc[i] * pSrc[i];
+	}
+	sumsq /= (q63_t)(blockSize - 1);
+	/* Widen before multiplying: blockSize * (blockSize - 1) wraps in 32 bits
+	   once blockSize exceeds 65536. */
+	sum = (q31_t)((q63_t)sum * sum / ((q63_t)blockSize * (blockSize - 1)));
+	*pResult = (q15_t)(sqrtf((float)ref_sat_q15( (sumsq - sum) >> 15) / 32768.0f ) * 32768.0f);
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/var.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/var.c
new file mode 100644
index 0000000..f5da3a6
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/StatisticsFunctions/var.c
@@ -0,0 +1,70 @@
+#include "ref.h"
+
+/*
+ * Stores in *pResult (sumsq - sum^2 / N) / (N - 1) for the block, where N
+ * is blockSize. A block of exactly one element yields 0.
+ */
+void ref_var_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+	float32_t total = 0, energy = 0;
+	uint32_t k;
+
+	if (blockSize != 1)
+	{
+		for (k = 0; k < blockSize; k++)
+		{
+			total += pSrc[k];
+			energy += pSrc[k] * pSrc[k];
+		}
+		*pResult = (energy - total * total / (float32_t)blockSize) / ((float32_t)blockSize - 1);
+	}
+	else
+	{
+		*pResult = 0;
+	}
+}
+
+/*
+ * Variance of a q31 block. Each input is shifted right by 8 before it is
+ * accumulated; the result is ((sumsq - sum^2 / N) / (N - 1)) >> 15 in
+ * 64-bit integer arithmetic, where N is blockSize. A block of exactly one
+ * element yields 0.
+ */
+void ref_var_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+	q63_t total = 0, energy = 0;
+	q63_t centred;
+	uint32_t k;
+
+	if (blockSize == 1)
+	{
+		*pResult = 0;
+		return;
+	}
+
+	for (k = 0; k < blockSize; k++)
+	{
+		const q31_t scaled = pSrc[k] >> 8;
+
+		total += scaled;
+		energy += (q63_t)scaled * scaled;
+	}
+
+	centred = energy - total * total / (q31_t)blockSize;
+	*pResult = (centred / ((q31_t)blockSize - 1)) >> 15;
+}
+
+/*
+ * Variance of a q15 block: (sumsq - sum^2 / N) / (N - 1) is computed in
+ * 64 bits, narrowed to 32 bits and then shifted right by 15, where N is
+ * blockSize. A block of exactly one element yields 0.
+ */
+void ref_var_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+	q31_t total = 0;
+	q63_t energy = 0;
+	q63_t centred;
+	q31_t narrowed;
+	uint32_t k;
+
+	if (blockSize == 1)
+	{
+		*pResult = 0;
+		return;
+	}
+
+	for (k = 0; k < blockSize; k++)
+	{
+		total += pSrc[k];
+		energy += (q63_t)pSrc[k] * pSrc[k];
+	}
+
+	centred = energy - (q63_t)total * total / (q63_t)blockSize;
+	/* The narrowing to 32 bits happens before the shift. */
+	narrowed = (q31_t)(centred / ((q63_t)blockSize - 1));
+	*pResult = narrowed >> 15;
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/SupportFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/SupportFunctions.c
new file mode 100644
index 0000000..9abab56
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/SupportFunctions.c
@@ -0,0 +1,6 @@
+
+#include "copy.c"
+#include "fill.c"
+#include "fixed_to_fixed.c"
+#include "fixed_to_float.c"
+#include "float_to_fixed.c"
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/copy.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/copy.c
new file mode 100644
index 0000000..08089f5
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/copy.c
@@ -0,0 +1,53 @@
+#include "ref.h"
+
+/* Copies blockSize elements from pSrc to pDst, in ascending order. */
+void ref_copy_f32(
+  float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	const float32_t *from = pSrc;
+	float32_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = *from++;
+		left--;
+	}
+}
+
+/* Copies blockSize elements from pSrc to pDst, in ascending order. */
+void ref_copy_q31(
+  q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	const q31_t *from = pSrc;
+	q31_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = *from++;
+		left--;
+	}
+}
+
+/* Copies blockSize elements from pSrc to pDst, in ascending order. */
+void ref_copy_q15(
+  q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	const q15_t *from = pSrc;
+	q15_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = *from++;
+		left--;
+	}
+}
+
+/* Copies blockSize elements from pSrc to pDst, in ascending order. */
+void ref_copy_q7(
+  q7_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	const q7_t *from = pSrc;
+	q7_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = *from++;
+		left--;
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fill.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fill.c
new file mode 100644
index 0000000..6082351
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fill.c
@@ -0,0 +1,53 @@
+#include "ref.h"
+
+/* Writes value into the first blockSize elements of pDst. */
+void ref_fill_f32(
+  float32_t value,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	float32_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = value;
+		left--;
+	}
+}
+
+/* Writes value into the first blockSize elements of pDst. */
+void ref_fill_q31(
+  q31_t value,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	q31_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = value;
+		left--;
+	}
+}
+
+/* Writes value into the first blockSize elements of pDst. */
+void ref_fill_q15(
+  q15_t value,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	q15_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = value;
+		left--;
+	}
+}
+
+/* Writes value into the first blockSize elements of pDst. */
+void ref_fill_q7(
+  q7_t value,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	q7_t *to = pDst;
+	uint32_t left = blockSize;
+
+	while (left > 0U)
+	{
+		*to++ = value;
+		left--;
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fixed_to_fixed.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fixed_to_fixed.c
new file mode 100644
index 0000000..f0c1d0f
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fixed_to_fixed.c
@@ -0,0 +1,79 @@
+#include "ref.h"
+
+/* Converts each q31 element to q15 by shifting it right by 16 bits. */
+void ref_q31_to_q15(
+  q31_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		pDst[k] = pSrc[k] >> 16;
+		k++;
+	}
+}
+
+/* Converts each q31 element to q7 by shifting it right by 24 bits. */
+void ref_q31_to_q7(
+  q31_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		pDst[k] = pSrc[k] >> 24;
+		k++;
+	}
+}
+
+/*
+ * Converts each q15 element to q31 by moving it into the upper 16 bits.
+ * The scaling is a multiplication by 2^16, not "<< 16": left-shifting a
+ * negative signed value is undefined behaviour in C, while the product
+ * always fits in 32 bits.
+ */
+void ref_q15_to_q31(
+  q15_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+	
+	for(i=0;i<blockSize;i++)
+	{
+		pDst[i] = ((q31_t)pSrc[i]) * 65536;
+	}
+}
+
+/* Converts each q15 element to q7 by shifting it right by 8 bits. */
+void ref_q15_to_q7(
+  q15_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		pDst[k] = pSrc[k] >> 8;
+		k++;
+	}
+}
+
+/*
+ * Converts each q7 element to q31 by moving it into the upper 8 bits.
+ * The scaling is a multiplication by 2^24, not "<< 24": left-shifting a
+ * negative signed value is undefined behaviour in C, while the product
+ * always fits in 32 bits.
+ */
+void ref_q7_to_q31(
+  q7_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+	
+	for(i=0;i<blockSize;i++)
+	{
+		pDst[i] = ((q31_t)pSrc[i]) * 16777216;
+	}
+}
+
+/*
+ * Converts each q7 element to q15 by moving it into the upper 8 bits.
+ * The scaling is a multiplication by 2^8, not "<< 8": left-shifting a
+ * negative signed value is undefined behaviour in C, while the product
+ * always fits in 16 bits.
+ */
+void ref_q7_to_q15(
+  q7_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t i;
+	
+	for(i=0;i<blockSize;i++)
+	{
+		pDst[i] = ((q15_t)pSrc[i]) * 256;
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fixed_to_float.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fixed_to_float.c
new file mode 100644
index 0000000..296a50e
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/fixed_to_float.c
@@ -0,0 +1,53 @@
+#include "ref.h"
+
+/* Converts each q63 element to float and divides it by 2^63. */
+void ref_q63_to_float(
+  q63_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		pDst[k] = ((float32_t)pSrc[k]) / 9223372036854775808.0f;
+		k++;
+	}
+}
+
+/* Converts each q31 element to float and divides it by 2^31. */
+void ref_q31_to_float(
+  q31_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		pDst[k] = ((float32_t)pSrc[k]) / 2147483648.0f;
+		k++;
+	}
+}
+	
+/* Converts each q15 element to float and divides it by 2^15. */
+void ref_q15_to_float(
+  q15_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		pDst[k] = ((float32_t)pSrc[k]) / 32768.0f;
+		k++;
+	}
+}
+	
+/* Converts each q7 element to float and divides it by 2^7. */
+void ref_q7_to_float(
+  q7_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k = 0;
+
+	while (k < blockSize)
+	{
+		pDst[k] = ((float32_t)pSrc[k]) / 128.0f;
+		k++;
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/float_to_fixed.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/float_to_fixed.c
new file mode 100644
index 0000000..107556b
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/SupportFunctions/float_to_fixed.c
@@ -0,0 +1,52 @@
+#include "ref.h"
+
+/*
+ * Converts each float to q31: scales by 2^31, adds 0.5 (positive values)
+ * or -0.5 (all others), truncates to 64-bit integer and passes the result
+ * through ref_sat_q31().
+ */
+void ref_float_to_q31(
+  float32_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		float32_t scaled = pSrc[k];
+
+		scaled *= 2147483648.0f;
+		if (scaled > 0.0f)
+		{
+			scaled += 0.5f;
+		}
+		else
+		{
+			scaled += -0.5f;
+		}
+		pDst[k] = ref_sat_q31((q63_t)scaled);
+	}
+}
+	
+/*
+ * Converts each float to q15: scales by 2^15, adds 0.5 (positive values)
+ * or -0.5 (all others), truncates to 32-bit integer and passes the result
+ * through ref_sat_q15().
+ */
+void ref_float_to_q15(
+  float32_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		float32_t scaled = pSrc[k];
+
+		scaled *= 32768.0f;
+		if (scaled > 0.0f)
+		{
+			scaled += 0.5f;
+		}
+		else
+		{
+			scaled += -0.5f;
+		}
+		pDst[k] = ref_sat_q15((q31_t)scaled);
+	}
+}
+	
+/*
+ * Converts each float to q7: scales by 2^7, adds 0.5 (positive values)
+ * or -0.5 (all others), truncates to 16-bit integer and passes the result
+ * through ref_sat_q7().
+ */
+void ref_float_to_q7(
+  float32_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+	uint32_t k;
+
+	for (k = 0; k < blockSize; k++)
+	{
+		float32_t scaled = pSrc[k];
+
+		scaled *= 128.0f;
+		if (scaled > 0.0f)
+		{
+			scaled += 0.5f;
+		}
+		else
+		{
+			scaled += -0.5f;
+		}
+		pDst[k] = ref_sat_q7((q15_t)scaled);
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/TransformFunctions.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/TransformFunctions.c
new file mode 100644
index 0000000..d100918
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/TransformFunctions.c
@@ -0,0 +1,4 @@
+
+#include "cfft.c"
+#include "dct4.c"
+#include "rfft.c"
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/bitreversal.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/bitreversal.c
new file mode 100644
index 0000000..b5a4068
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/bitreversal.c
@@ -0,0 +1,30 @@
+#include "ref.h"
+
+
+;/*
+;* @brief  In-place bit reversal function. Exchanges two-word entries of pSrc pairwise, as listed in pBitRevTab.
+;* @param[in, out] *pSrc        points to the in-place buffer of unknown 32-bit data type.
+;* @param[in]      bitRevLen    number of (a, b) index pairs processed; 2*bitRevLen table entries are read.
+;* @param[in]      *pBitRevTab  points to bit reversal table, read as consecutive (a, b) pairs of word indices into pSrc.
+;* @return none.
+;*/
+void ref_arm_bitreversal_32(uint32_t *pSrc, uint32_t bitRevLen, uint32_t *pBitRevTab)
+{
+	uint32_t a,b,i,tmp;
+	
+	for(i=0; i<bitRevLen; i++) 
+	{
+		 a = pBitRevTab[2*i];		// word index of the first entry
+		 b = pBitRevTab[2*i + 1];	// word index of the entry it is exchanged with
+
+	//first word (labelled "real" by the original author): exchange pSrc[a] and pSrc[b]
+		 tmp = pSrc[a];
+		 pSrc[a] = pSrc[b];
+		 pSrc[b] = tmp;
+
+	//second word (originally labelled "complex", presumably the imaginary part): exchange pSrc[a+1] and pSrc[b+1]
+		 tmp = pSrc[a+1];
+		 pSrc[a+1] = pSrc[b+1];
+		 pSrc[b+1] = tmp;
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/cfft.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/cfft.c
new file mode 100644
index 0000000..ede4da6
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/cfft.c
@@ -0,0 +1,598 @@
+#include "ref.h"
+#include "arm_const_structs.h"
+/* In-place radix-2 complex FFT of S->fftLen interleaved (re, im) float pairs in p1; nonzero ifftFlag gives the inverse, scaled by 1/N. */
+void ref_cfft_f32(
+   const arm_cfft_instance_f32 * S, 
+   float32_t * p1,
+   uint8_t ifftFlag,
+   uint8_t bitReverseFlag)	// not read by this function
+{
+	int n, mmax, m, j, istep, i;
+	float32_t wtemp, wr, wpr, wpi, wi, theta;
+	float32_t tempr, tempi;
+	float32_t *  data = p1;
+	uint32_t N = S->fftLen;	// the only field of S that is used
+	int32_t dir = (ifftFlag) ? -1 : 1;	// flips the sign of the twiddle angle for the inverse
+	// NOTE(review): data-- below forms a pointer one element before p1; data[0] is not accessed until data++ restores it.
+	// decrement pointer since the original version used fortran style indexing.
+	data--;
+	// Reordering pass: data[1..n] is permuted so that each complex sample moves to its bit-reversed position.
+	n = N << 1;	// total number of floats (two per complex sample)
+	j = 1;
+	for (i = 1; i < n; i += 2) {
+		if (j > i) {	// exchange each pair of samples once only
+			tempr = data[j];     data[j] = data[i];     data[i] = tempr;
+			tempr = data[j+1]; data[j+1] = data[i+1]; data[i+1] = tempr;
+		}
+		m = n >> 1;
+		while (m >= 2 && j > m) {	// step j to the bit-reversed partner of the next i
+			j -= m;
+			m >>= 1;
+		}
+		j += m;
+	}
+	mmax = 2;
+	while (n > mmax) {	// one pass per stage; the butterfly span mmax doubles every pass
+		istep = 2*mmax;
+		theta = -6.283185307179586f/(dir*mmax);	// angle between successive twiddles of this stage
+		wtemp = sinf(0.5f*theta);
+		wpr = -2.0f*wtemp*wtemp;	// equals cos(theta) - 1
+		wpi = sinf(theta);
+		wr = 1.0f;	// twiddle starts at 1 + 0i
+		wi = 0.0f;
+		for (m = 1; m < mmax; m += 2) {	// m selects the twiddle; i then visits every butterfly that uses it
+			for (i = m; i <= n; i += istep) {
+				j =i + mmax;
+				tempr = wr*data[j]   - wi*data[j+1];	// (tempr, tempi) = twiddle * data[j]
+				tempi = wr*data[j+1] + wi*data[j];
+				data[j]   = data[i]   - tempr;
+				data[j+1] = data[i+1] - tempi;
+				data[i] += tempr;
+				data[i+1] += tempi;
+			}
+			wr = (wtemp = wr)*wpr - wi*wpi + wr;	// rotate the twiddle by theta; wtemp keeps the old wr
+			wi = wi*wpr + wtemp*wpi + wi;
+		}
+		mmax = istep;
+	}
+
+	// Inverse transform is scaled by 1/N
+	if (ifftFlag)
+	{
+		data++;	// back to 0-based indexing
+		for(i = 0; i<2*N; i++)
+		{
+			data[i] /= N;
+		}
+	}
+}
+/* q31 complex FFT reference: converts p1 to float in place, runs ref_cfft_f32, then converts back to q31 (forward result divided by fftLen). */
+void ref_cfft_q31(
+	const arm_cfft_instance_q31 * S,
+    q31_t * p1,
+    uint8_t ifftFlag,
+    uint8_t bitReverseFlag)
+{
+	uint32_t i;
+	float32_t *fSrc = (float32_t*)p1;	//same storage as p1, accessed as float
+	// NOTE(review): each element is read as q31_t and rewritten as float32_t in the same slot; assumes both types have the same size - confirm.
+	for(i=0;i<S->fftLen*2;i++)
+	{
+		//read the q31 data, cast to float, scale down for float
+		fSrc[i] = (float32_t)p1[i] / 2147483648.0f;
+	}
+	// Pick the constant f32 instance of the same length; a length not listed below skips the transform but not the conversion back.
+	switch(S->fftLen)
+	{
+   case 16: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len16, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+	}
+	// ref_cfft_f32 has already divided an inverse result by N; only the forward result is divided here. The casts do not saturate.
+	if (ifftFlag)
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q31, cast to q31
+			p1[i] = (q31_t)( fSrc[i] * 2147483648.0f );
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q31, divide by fftLen, cast to q31
+			p1[i] = (q31_t)( fSrc[i] * 2147483648.0f / (float32_t)S->fftLen);
+		}
+	}
+}
+
+/*
+ * Reference q15 complex FFT: converts pSrc to float in scratchArray, runs ref_cfft_f32 there,
+ * and writes the result back to pSrc as q15 (the forward result is additionally divided by fftLen).
+ * The float intermediate lives only in scratchArray, so pSrc is never written through a float
+ * pointer and only needs room for its 2*fftLen q15 values.
+ */
+void ref_cfft_q15(
+	const arm_cfft_instance_q15 * S,
+    q15_t * pSrc,
+    uint8_t ifftFlag,
+    uint8_t bitReverseFlag)
+{
+	uint32_t i;
+	float32_t *fSrc = scratchArray;	//float working copy; pSrc itself only ever holds q15 data
+	
+	for(i=0;i<S->fftLen*2;i++)
+	{
+		//read the q15 data, cast to float, scale down for float
+		fSrc[i] = (float32_t)pSrc[i] / 32768.0f;
+	}
+	
+	switch(S->fftLen)
+	{
+   case 16: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len16, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fSrc, ifftFlag, bitReverseFlag);
+		 break;
+	}
+	
+	if (ifftFlag)
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q15, cast to q15
+			pSrc[i] = (q15_t)( fSrc[i] * 32768.0f );
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q15, divide by fftLen, cast to q15
+			pSrc[i] = (q15_t)( fSrc[i] * 32768.0f / (float32_t)S->fftLen);
+		}
+	}
+}
+
+/* ref_cfft_radix2_f32: forwards to ref_cfft_f32 with the constant instance matching S->fftLen and the */
+/* flags stored in S; any length other than the nine powers of two listed below leaves pSrc untouched. */
+void ref_cfft_radix2_f32(
+	const arm_cfft_radix2_instance_f32 * S,
+	float32_t * pSrc)
+{
+	if (S->fftLen == 16)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len16, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 32)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len32, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 64)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 128)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len128, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 256)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len256, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 512)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len512, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 1024)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len1024, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 2048)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len2048, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 4096)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len4096, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+}
+/* q31 radix-2 CFFT reference: converts pSrc to float in place, runs ref_cfft_f32 with the flags stored in S, then converts back to q31. */
+void ref_cfft_radix2_q31(
+	const arm_cfft_radix2_instance_q31 * S,
+	q31_t * pSrc)
+{
+	uint32_t i;
+	float32_t *fSrc = (float32_t*)pSrc;	//same storage as pSrc, accessed as float
+	// NOTE(review): each element is read as q31_t and rewritten as float32_t in the same slot; assumes both types have the same size - confirm.
+	for(i=0;i<S->fftLen*2;i++)
+	{
+		//read the q31 data, cast to float, scale down for float
+		fSrc[i] = (float32_t)pSrc[i] / 2147483648.0f;
+	}
+	// Pick the constant f32 instance of the same length; a length not listed below skips the transform but not the conversion back.
+	switch(S->fftLen)
+	{
+   case 16: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len16, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+	}
+	// ref_cfft_f32 has already divided an inverse result by N; only the forward result is divided here. The casts do not saturate.
+	if (S->ifftFlag)
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q31, cast to q31
+			pSrc[i] = (q31_t)( fSrc[i] * 2147483648.0f );
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q31, divide by fftLen, cast to q31
+			pSrc[i] = (q31_t)( fSrc[i] * 2147483648.0f / (float32_t)S->fftLen);
+		}
+	}
+}
+
+/*
+ * Reference q15 radix-2 CFFT: converts pSrc to float in scratchArray, runs ref_cfft_f32 there with
+ * the flags stored in S, and writes the result back to pSrc as q15 (forward result divided by fftLen).
+ * The float intermediate lives only in scratchArray, so pSrc is never written through a float
+ * pointer and only needs room for its 2*fftLen q15 values.
+ */
+void ref_cfft_radix2_q15(
+	const arm_cfft_radix2_instance_q15 * S,
+	q15_t * pSrc)
+{
+	uint32_t i;
+	float32_t *fSrc = scratchArray;	//float working copy; pSrc itself only ever holds q15 data
+	
+	for(i=0;i<S->fftLen*2;i++)
+	{
+		//read the q15 data, cast to float, scale down for float
+		fSrc[i] = (float32_t)pSrc[i] / 32768.0f;
+	}
+	
+	switch(S->fftLen)
+	{
+   case 16: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len16, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+	}
+	
+	if (S->ifftFlag)
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q15, cast to q15
+			pSrc[i] = (q15_t)( fSrc[i] * 32768.0f );
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q15, divide by fftLen, cast to q15
+			pSrc[i] = (q15_t)( fSrc[i] * 32768.0f / (float32_t)S->fftLen);
+		}
+	}
+}
+
+/* ref_cfft_radix4_f32: forwards to ref_cfft_f32 with the constant instance matching S->fftLen and the */
+/* flags stored in S; any length other than the nine powers of two listed below leaves pSrc untouched. */
+void ref_cfft_radix4_f32(
+	const arm_cfft_radix4_instance_f32 * S,
+	float32_t * pSrc)
+{
+	if (S->fftLen == 16)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len16, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 32)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len32, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 64)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 128)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len128, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 256)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len256, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 512)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len512, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 1024)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len1024, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 2048)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len2048, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+	else if (S->fftLen == 4096)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len4096, pSrc, S->ifftFlag, S->bitReverseFlag);
+	}
+}
+/* q31 radix-4 CFFT reference: converts pSrc to float in place, runs ref_cfft_f32 with the flags stored in S, then converts back to q31. */
+void ref_cfft_radix4_q31(
+	const arm_cfft_radix4_instance_q31 * S,
+	q31_t * pSrc)
+{
+	uint32_t i;
+	float32_t *fSrc = (float32_t*)pSrc;	//same storage as pSrc, accessed as float
+	// NOTE(review): each element is read as q31_t and rewritten as float32_t in the same slot; assumes both types have the same size - confirm.
+	for(i=0;i<S->fftLen*2;i++)
+	{
+		//read the q31 data, cast to float, scale down for float
+		fSrc[i] = (float32_t)pSrc[i] / 2147483648.0f;
+	}
+	// Pick the constant f32 instance of the same length; a length not listed below skips the transform but not the conversion back.
+	switch(S->fftLen)
+	{
+   case 16: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len16, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+	}
+	// ref_cfft_f32 has already divided an inverse result by N; only the forward result is divided here. The casts do not saturate.
+	if (S->ifftFlag)
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q31, cast to q31
+			pSrc[i] = (q31_t)( fSrc[i] * 2147483648.0f );
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q31, divide by fftLen, cast to q31
+			pSrc[i] = (q31_t)( fSrc[i] * 2147483648.0f / (float32_t)S->fftLen);
+		}
+	}
+}
+
+/*
+ * Reference q15 radix-4 CFFT: converts pSrc to float in scratchArray, runs ref_cfft_f32 there with
+ * the flags stored in S, and writes the result back to pSrc as q15 (forward result divided by fftLen).
+ * The float intermediate lives only in scratchArray, so pSrc is never written through a float
+ * pointer and only needs room for its 2*fftLen q15 values.
+ */
+void ref_cfft_radix4_q15(
+	const arm_cfft_radix4_instance_q15 * S,
+	q15_t * pSrc)
+{
+	uint32_t i;
+	float32_t *fSrc = scratchArray;	//float working copy; pSrc itself only ever holds q15 data
+	
+	for(i=0;i<S->fftLen*2;i++)
+	{
+		//read the q15 data, cast to float, scale down for float
+		fSrc[i] = (float32_t)pSrc[i] / 32768.0f;
+	}
+	
+	switch(S->fftLen)
+	{
+   case 16: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len16, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fSrc, S->ifftFlag, S->bitReverseFlag);
+		 break;
+	}
+	
+	if (S->ifftFlag)
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q15, cast to q15
+			pSrc[i] = (q15_t)( fSrc[i] * 32768.0f );
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLen*2;i++)
+		{
+			//read the float data, scale up for q15, divide by fftLen, cast to q15
+			pSrc[i] = (q15_t)( fSrc[i] * 32768.0f / (float32_t)S->fftLen);
+		}
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/dct4.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/dct4.c
new file mode 100644
index 0000000..9c1f207
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/dct4.c
@@ -0,0 +1,89 @@
+#include "ref.h"
+/* Type-IV DCT evaluated directly from its defining sum: out[k] = sqrt(2/N) * sum_n in[n] * cos((pi/N) * (k+0.5) * (n+0.5)), with N = S->N. */
+void ref_dct4_f32(
+  const arm_dct4_instance_f32 * S,
+  float32_t * pState,	// not used by this function
+  float32_t * pInlineBuffer)	// N inputs on entry, N outputs on return
+{
+	uint32_t n,k;
+	float32_t sum;
+   float32_t pi_by_N = 3.14159265358979f / (float32_t)S->N;
+   float32_t tmp;
+   float32_t normalize = sqrtf(2.0f / (float32_t)S->N);
+	// Every output needs all N inputs, so results are collected in scratchArray before pInlineBuffer is overwritten.
+	for(k=0;k<S->N;k++)
+	{
+		sum=0.0f;
+      tmp = ((float32_t)k + 0.5f)*pi_by_N;	// part of the cosine argument that depends only on k
+		for(n=0;n<S->N;n++)
+		{
+			sum += pInlineBuffer[n] * cosf(tmp * ((float32_t)n + 0.5f));
+		}
+		scratchArray[k] = normalize * sum;
+	}
+	// Copy the finished outputs back over the input buffer.
+	for(k=0;k<S->N;k++)
+	{
+		pInlineBuffer[k] = scratchArray[k];
+	}
+}
+/* q31 DCT-IV reference: converts pInlineBuffer to float in place, runs ref_dct4_f32, then rescales by 2^31/N, rounds and casts back to q31. */
+void ref_dct4_q31(
+  const arm_dct4_instance_q31 * S,
+  q31_t * pState,	// not used by this function
+  q31_t * pInlineBuffer)
+{
+	arm_dct4_instance_f32 SS;	// only N is filled in; ref_dct4_f32 reads no other field
+	float32_t *fSrc = (float32_t*)pInlineBuffer;	// same storage as pInlineBuffer, accessed as float
+	uint32_t i;
+	// NOTE(review): elements are read as q31_t and rewritten as float32_t in the same slot; assumes both types have the same size - confirm.
+	SS.N = S->N;
+	
+	for(i=0;i<S->N;i++)
+	{
+		//read the q31 data, cast to float, scale down for float
+		fSrc[i] = (float32_t)pInlineBuffer[i] / 2147483648.0f;
+	}
+	
+	ref_dct4_f32(&SS,(float32_t*)0,fSrc);	// null state pointer: ref_dct4_f32 never dereferences pState
+	
+	for(i=0;i<S->N;i++)
+	{
+		fSrc[i] = fSrc[i] * 2147483648.0f / (float32_t)S->N ;	// scale up for q31 and divide by N
+		fSrc[i] += (fSrc[i] > 0) ? 0.5f : -0.5f;	// bias so the truncating cast rounds; zero takes -0.5
+		pInlineBuffer[i] = (q31_t)fSrc[i];	// no saturation is applied
+	}
+}
+/* q15 DCT-IV reference: converts the input to float, runs ref_dct4_f32 on pInlineBuffer viewed as float, then rescales by 2^15/N and casts to q15. */
+void ref_dct4_q15(
+  const arm_dct4_instance_q15 * S,
+  q15_t * pState,	// not used by this function
+  q15_t * pInlineBuffer)
+{
+	arm_dct4_instance_f32 SS;	// only N is filled in; ref_dct4_f32 reads no other field
+	float32_t *fSrc = (float32_t*)pInlineBuffer;	// NOTE(review): N floats are stored here, so the buffer must hold N float32 values and be float-aligned - confirm with callers
+	uint32_t i;
+	
+	SS.N = S->N;
+	// A float is wider than a q15 sample, so converting in place would overwrite inputs not yet read; stage through scratchArray.
+	for(i=0;i<S->N;i++)
+	{
+		//read the q15 data, cast to float, scale down for float, place in temporary buffer
+		scratchArray[i] = (float32_t)pInlineBuffer[i] / 32768.0f;
+	}
+	
+	for(i=0;i<S->N;i++)
+	{
+		//copy from temp buffer to final buffer
+		fSrc[i] = scratchArray[i];
+	}
+	// ref_dct4_f32 uses scratchArray for its own results, which is why the input was copied out of it first.
+	ref_dct4_f32(&SS,(float32_t*)0,fSrc);
+	
+	for(i=0;i<S->N;i++)
+	{
+		fSrc[i] = fSrc[i] * 32768.0f / (float32_t)S->N;	// scale up for q15 and divide by N
+		fSrc[i] += (fSrc[i] > 0) ? 0.5f : -0.5f;	// bias so the truncating cast rounds; zero takes -0.5
+		pInlineBuffer[i] = (q15_t)fSrc[i];	// no saturation is applied
+	}
+}
diff --git a/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/rfft.c b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/rfft.c
new file mode 100644
index 0000000..79738f0
--- /dev/null
+++ b/Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src/TransformFunctions/rfft.c
@@ -0,0 +1,302 @@
+#include "ref.h"	
+#include "arm_const_structs.h"
+	
+/* Reference real FFT (f32). Forward: the real samples in pSrc are widened to (re, 0) pairs in pDst and transformed. */
+/* Inverse: 2*fftLenReal floats are copied to pDst, transformed, and then only the real parts are kept. */
+void ref_rfft_f32(
+	arm_rfft_instance_f32 * S,
+  float32_t * pSrc,
+  float32_t * pDst)
+{
+	uint32_t k;
+
+	if (!S->ifftFlagR)
+	{
+		for (k = 0; k < S->fftLenReal; ++k)
+		{
+			pDst[2*k]     = pSrc[k];
+			pDst[2*k + 1] = 0.0f;
+		}
+	}
+	else
+	{
+		for (k = 0; k < S->fftLenReal*2; ++k)
+		{
+			pDst[k] = pSrc[k];
+		}
+	}
+	/* Only these four lengths are transformed; any other length leaves the staged data as it is. */
+	if (S->fftLenReal == 128)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len128, pDst, S->ifftFlagR, S->bitReverseFlagR);
+	}
+	else if (S->fftLenReal == 512)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len512, pDst, S->ifftFlagR, S->bitReverseFlagR);
+	}
+	else if (S->fftLenReal == 2048)
+	{
+		ref_cfft_f32(&arm_cfft_sR_f32_len2048, pDst, S->ifftFlagR, S->bitReverseFlagR);
+	}
+	else if (S->fftLenReal == 8192)
+	{
+		ref_cfft_f32(&ref_cfft_sR_f32_len8192, pDst, S->ifftFlagR, S->bitReverseFlagR);
+	}
+
+	if (S->ifftFlagR)
+	{
+		/* Keep the real parts only; ascending order makes the in-place compaction safe. */
+		for (k = 0; k < S->fftLenReal; ++k)
+		{
+			pDst[k] = pDst[2*k];
+		}
+	}
+}
+/* Reference for the "fast" real FFT: expands the data to a full complex sequence in pOut (2*fftLenRFFT floats), runs ref_cfft_f32, then repacks. */
+void ref_rfft_fast_f32(
+	arm_rfft_fast_instance_f32 * S,
+	float32_t * p, float32_t * pOut,
+	uint8_t ifftFlag)
+{
+	uint32_t i,j;
+	// Inverse: p holds fftLenRFFT floats of packed half spectrum; rebuild all fftLenRFFT complex bins in pOut first.
+	if (ifftFlag)
+	{
+		for(i=0;i<S->fftLenRFFT;i++)
+		{
+			pOut[i] = p[i];	// lower half of the bins is copied unchanged
+		}
+		//unpack first sample's complex part into middle sample's real part
+		pOut[S->fftLenRFFT] = pOut[1];
+		pOut[S->fftLenRFFT+1] = 0;
+		pOut[1] = 0;
+		j=4;	// 2*i - j equals 2*(fftLenRFFT - i), the float offset of the mirrored bin
+		for(i = S->fftLenRFFT / 2 + 1;i < S->fftLenRFFT;i++)
+		{
+			pOut[2*i+0] = p[2*i+0 - j];	// upper bin i is the complex conjugate of bin fftLenRFFT - i
+			pOut[2*i+1] = -p[2*i+1 - j];
+			j+=4;
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLenRFFT;i++)
+		{
+			pOut[2*i+0] = p[i];	// forward: real input widened to (re, 0) pairs
+			pOut[2*i+1] = 0.0f;
+		}
+	}
+	// The bit reverse flag is always passed as 1; a length not listed below skips the transform.
+	switch(S->fftLenRFFT)
+	{   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, pOut, ifftFlag, 1);
+		 break;
+   
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, pOut, ifftFlag, 1);
+		 break;
+   
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, pOut, ifftFlag, 1);
+		 break;
+   
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, pOut, ifftFlag, 1);
+		 break;
+   
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, pOut, ifftFlag, 1);
+		 break;
+   
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, pOut, ifftFlag, 1);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, pOut, ifftFlag, 1);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, pOut, ifftFlag, 1);
+		 break;
+	}
+	
+	if (ifftFlag)
+	{
+		//throw away the imaginary part which should be all zeros
+		for(i=0;i<S->fftLenRFFT;i++)
+		{
+			pOut[i] = pOut[2*i];	// ascending order keeps the in-place compaction safe
+		}
+	}
+	else
+	{
+		//pack middle sample's real part into first sample's complex part
+		pOut[1] = pOut[S->fftLenRFFT];
+	}
+}
+/* q31 real FFT reference: pDst doubles as the float working buffer (2*fftLenReal floats) for ref_cfft_f32, then is converted to q31 in place. */
+void ref_rfft_q31(
+  const arm_rfft_instance_q31 * S,
+  q31_t * pSrc,
+  q31_t * pDst)
+{
+	uint32_t i;
+	float32_t *fDst = (float32_t*)pDst;	// same storage as pDst, accessed as float
+	// Stage the input in fDst as interleaved complex floats (inverse: 2*fftLenReal values copied; forward: imaginary parts set to 0).
+	if (S->ifftFlagR)
+	{
+		for(i=0;i<S->fftLenReal*2;i++)
+		{
+			fDst[i] = (float32_t)pSrc[i] / 2147483648.0f;
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLenReal;i++)
+		{
+			fDst[2*i+0] = (float32_t)pSrc[i] / 2147483648.0f;
+			fDst[2*i+1] = 0.0f;
+		}
+	}
+	// Complex transform of fftLenReal points; a length not listed below skips the transform but not the conversion back.
+	switch(S->fftLenReal)
+	{
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+   
+   case 8192: 
+		 ref_cfft_f32(&ref_cfft_sR_f32_len8192, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	}
+	// NOTE(review): only the first fftLenReal q31 slots of pDst are written below; later slots keep float bit patterns. The casts do not saturate.
+	if (S->ifftFlagR)
+	{
+		//throw away the imaginary part which should be all zeros		
+		for(i=0;i<S->fftLenReal;i++)
+		{
+			//read the float data, scale up for q31, cast to q31
+			pDst[i] = (q31_t)( fDst[2*i] * 2147483648.0f);
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLenReal;i++)
+		{
+			//read the float data, scale up for q31, divide by fftLenReal, cast to q31
+			pDst[i] = (q31_t)( fDst[i] * 2147483648.0f / (float32_t)S->fftLenReal);
+		}
+	}
+}
+/* q15 real FFT reference: pDst doubles as the float working buffer (2*fftLenReal floats) for ref_cfft_f32, then is converted to q15 in place. */
+void ref_rfft_q15(
+  const arm_rfft_instance_q15 * S,
+  q15_t * pSrc,
+  q15_t * pDst)
+{
+	uint32_t i;
+	float32_t *fDst = (float32_t*)pDst;	// same storage as pDst, accessed as float
+	// NOTE(review): 2*fftLenReal floats are stored through fDst, so pDst must hold that many float32 values and be float-aligned - confirm with callers.
+	// Stage the input in fDst as interleaved complex floats (inverse: 2*fftLenReal values copied; forward: imaginary parts set to 0).
+	if (S->ifftFlagR)
+	{
+		for(i=0;i<S->fftLenReal*2;i++)
+		{
+			fDst[i] = (float32_t)pSrc[i] / 32768.0f;
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLenReal;i++)
+		{
+			//read the q15 data, cast to float, scale down for float
+			fDst[2*i+0] = (float32_t)pSrc[i] / 32768.0f;
+			fDst[2*i+1] = 0.0f;
+		}
+	}
+	// Complex transform of fftLenReal points; a length not listed below skips the transform but not the conversion back.
+	switch(S->fftLenReal)
+	{   
+   case 32: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len32, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 64: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len64, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 128: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len128, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 256: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len256, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 512: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len512, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	 
+   case 1024: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len1024, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+   
+   case 2048: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len2048, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+   
+   case 4096: 
+		 ref_cfft_f32(&arm_cfft_sR_f32_len4096, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+   
+   case 8192: 
+		 ref_cfft_f32(&ref_cfft_sR_f32_len8192, fDst, S->ifftFlagR, S->bitReverseFlagR);
+		 break;
+	}
+	// Only the first fftLenReal q15 slots of pDst are written below; the casts do not saturate.
+	if (S->ifftFlagR)
+	{
+		//throw away the imaginary part which should be all zeros
+		for(i=0;i<S->fftLenReal;i++)
+		{
+			pDst[i] = (q15_t)( fDst[2*i] * 32768.0f);	//scale up for q15, cast to q15
+		}
+	}
+	else
+	{
+		for(i=0;i<S->fftLenReal;i++)
+		{
+			pDst[i] = (q15_t)( fDst[i] * 32768.0f / (float32_t)S->fftLenReal);	//scale up for q15, divide by fftLenReal, cast to q15
+		}
+	}
+}
author	joshua <joshua@joshuayun.com>	2023-12-30 23:54:31 -0500
committer	joshua <joshua@joshuayun.com>	2023-12-30 23:54:31 -0500
commit	86608c6770cf08c138a2bdab5855072f64be09ef (patch)
tree	494a61b3ef37e76f9235a0d10f5c93d97290a35f /Drivers/CMSIS/DSP/DSP_Lib_TestSuite/RefLibs/src
download	sdr-software-86608c6770cf08c138a2bdab5855072f64be09ef.tar.gz