From cf065d439efed2141aaf09454beb445e80dc7539 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Tue, 12 Mar 2024 00:54:26 +0800
Subject: [PATCH 01/17] stm32 CORDIC: ZeroOverhead q1.31 1 arg 1 res mode

---
 embassy-stm32/src/cordic.rs | 460 ++++++++++++++++++++++++++++++++++++
 embassy-stm32/src/lib.rs    |   2 +
 2 files changed, 462 insertions(+)
 create mode 100644 embassy-stm32/src/cordic.rs

diff --git a/embassy-stm32/src/cordic.rs b/embassy-stm32/src/cordic.rs
new file mode 100644
index 000000000..952ee187a
--- /dev/null
+++ b/embassy-stm32/src/cordic.rs
@@ -0,0 +1,460 @@
+//! CORDIC co-processor
+
+use crate::peripherals;
+use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef};
+
+pub use enums::*;
+
+mod enums {
+    /// CORDIC function
+    #[allow(missing_docs)]
+    #[derive(Clone, Copy)]
+    pub enum Function {
+        Cos = 0,
+        Sin,
+        Phase,
+        Modulus,
+        Arctan,
+        Cosh,
+        Sinh,
+        Arctanh,
+        Ln,
+        Sqrt,
+    }
+
+    /// CORDIC precision
+    #[allow(missing_docs)]
+    #[derive(Clone, Copy)]
+    pub enum Precision {
+        Iters4 = 1,
+        Iters8,
+        Iters12,
+        Iters16,
+        Iters20,
+        Iters24,
+        Iters28,
+        Iters32,
+        Iters36,
+        Iters40,
+        Iters44,
+        Iters48,
+        Iters52,
+        Iters56,
+        Iters60,
+    }
+
+    /// CORDIC scale
+    #[allow(non_camel_case_types)]
+    #[allow(missing_docs)]
+    #[derive(Clone, Copy, Default)]
+    pub enum Scale {
+        #[default]
+        A1_R1 = 0,
+        A1o2_R2,
+        A1o4_R4,
+        A1o8_R8,
+        A1o16_R16,
+        A1o32_R32,
+        A1o64_R64,
+        A1o128_R128,
+    }
+
+    /// CORDIC argument/result count
+    #[allow(missing_docs)]
+    #[derive(Clone, Copy, Default)]
+    pub enum Count {
+        #[default]
+        One,
+        Two,
+    }
+
+    /// CORDIC argument/result data width
+    #[allow(missing_docs)]
+    #[derive(Clone, Copy)]
+    pub enum Width {
+        Bits32,
+        Bits16,
+    }
+
+    /// Cordic driver running mode
+    #[derive(Clone, Copy)]
+    pub enum Mode {
+        /// After caculation start, a read to RDATA register will block AHB until the caculation finished
+        ZeroOverhead,
+
+        /// Use CORDIC interrupt to trigger a read result value
+        Interrupt,
+
+        /// Use DMA to write/read value
+        Dma,
+    }
+}
+
+/// Low-level CORDIC access.
+#[cfg(feature = "unstable-pac")]
+pub mod low_level {
+    pub use super::sealed::*;
+}
+
+pub(crate) mod sealed {
+    use super::*;
+    use crate::pac::cordic::vals;
+
+    /// Cordic instance
+    pub trait Instance {
+        /// Get access to CORDIC registers
+        fn regs() -> crate::pac::cordic::Cordic;
+
+        /// Set Function value
+        fn set_func(&self, func: Function) {
+            Self::regs()
+                .csr()
+                .modify(|v| v.set_func(vals::Func::from_bits(func as u8)));
+        }
+
+        /// Set Precision value
+        fn set_precision(&self, precision: Precision) {
+            Self::regs()
+                .csr()
+                .modify(|v| v.set_precision(vals::Precision::from_bits(precision as u8)))
+        }
+
+        /// Set Scale value
+        fn set_scale(&self, scale: Scale) {
+            Self::regs()
+                .csr()
+                .modify(|v| v.set_scale(vals::Scale::from_bits(scale as u8)))
+        }
+
+        /// Enable global interrupt
+        fn enable_irq(&self) {
+            Self::regs().csr().modify(|v| v.set_ien(true))
+        }
+
+        /// Disable global interrupt
+        fn disable_irq(&self) {
+            Self::regs().csr().modify(|v| v.set_ien(false))
+        }
+
+        /// Enable Read DMA
+        fn enable_read_dma(&self) {
+            Self::regs().csr().modify(|v| {
+                v.set_dmaren(true);
+            })
+        }
+
+        /// Disable Read DMA
+        fn disable_read_dma(&self) {
+            Self::regs().csr().modify(|v| {
+                v.set_dmaren(false);
+            })
+        }
+
+        /// Enable Write DMA
+        fn enable_write_dma(&self) {
+            Self::regs().csr().modify(|v| {
+                v.set_dmawen(true);
+            })
+        }
+
+        /// Disable Write DMA
+        fn disable_write_dma(&self) {
+            Self::regs().csr().modify(|v| {
+                v.set_dmawen(false);
+            })
+        }
+
+        /// Set NARGS value
+        fn set_argument_count(&self, n: Count) {
+            Self::regs().csr().modify(|v| {
+                v.set_nargs(match n {
+                    Count::One => vals::Num::NUM1,
+                    Count::Two => vals::Num::NUM2,
+                })
+            })
+        }
+
+        /// Set NRES value
+        fn set_result_count(&self, n: Count) {
+            Self::regs().csr().modify(|v| {
+                v.set_nres(match n {
+                    Count::One => vals::Num::NUM1,
+                    Count::Two => vals::Num::NUM2,
+                });
+            })
+        }
+
+        /// Set ARGSIZE and RESSIZE value
+        fn set_data_width(&self, arg: Width, res: Width) {
+            Self::regs().csr().modify(|v| {
+                v.set_argsize(match arg {
+                    Width::Bits32 => vals::Size::BITS32,
+                    Width::Bits16 => vals::Size::BITS16,
+                });
+                v.set_ressize(match res {
+                    Width::Bits32 => vals::Size::BITS32,
+                    Width::Bits16 => vals::Size::BITS16,
+                })
+            })
+        }
+
+        /// Read RRDY flag
+        fn ready_to_read(&self) -> bool {
+            Self::regs().csr().read().rrdy()
+        }
+
+        /// Write value to WDATA
+        fn write_argument(&self, arg: u32) {
+            Self::regs().wdata().write_value(arg)
+        }
+
+        /// Read value from RDATA
+        fn read_result(&self) -> u32 {
+            Self::regs().rdata().read()
+        }
+    }
+}
+
+/// CORDIC driver
+pub struct Cordic<'d, T: Instance> {
+    cordic: PeripheralRef<'d, T>,
+    config: Config,
+    //state: State,
+}
+
+/// CORDIC instance trait
+pub trait Instance: sealed::Instance + Peripheral<P = Self> + crate::rcc::RccPeripheral {}
+
+/// CORDIC configuration
+pub struct Config {
+    function: Function,
+    precision: Precision,
+    scale: Scale,
+    mode: Mode,
+    first_result: bool,
+}
+
+// CORDIC running state
+//struct State {
+//    input_buf: [u32; 8],
+//    buf_len: usize,
+//}
+
+impl Config {
+    /// Create a config for Cordic driver
+    pub fn new(function: Function, precision: Precision, scale: Option<Scale>, mode: Mode, first_result: bool) -> Self {
+        Self {
+            function,
+            precision,
+            scale: scale.unwrap_or_default(),
+            mode,
+            first_result,
+        }
+    }
+
+    fn check_scale(&self) -> bool {
+        let scale_raw = self.scale as u8;
+
+        match self.function {
+            Function::Cos | Function::Sin | Function::Phase | Function::Modulus => 0 == scale_raw,
+            Function::Arctan => (0..=7).contains(&scale_raw),
+            Function::Cosh | Function::Sinh | Function::Arctanh => 1 == scale_raw,
+            Function::Ln => (1..=4).contains(&scale_raw),
+            Function::Sqrt => (0..=2).contains(&scale_raw),
+        }
+    }
+}
+
+impl<'d, T: Instance> Cordic<'d, T> {
+    /// Create a Cordic driver instance
+    ///
+    /// Note:  
+    /// If you need a periperhal -> CORDIC -> peripehral mode,  
+    /// you may want to set Cordic into [Mode::ZeroOverhead] mode, and add extra arguemnts with [Self::extra_config]
+    pub fn new(cordic: impl Peripheral<P = T> + 'd, config: Config) -> Self {
+        T::enable_and_reset();
+
+        into_ref!(cordic);
+
+        if !config.check_scale() {
+            panic!("Scale value is not compatible with Function")
+        }
+
+        let mut instance = Self {
+            cordic,
+            config,
+            // state: State {
+            //     input_buf: [0u32; 8],
+            //     buf_len: 0,
+            // },
+        };
+
+        instance.reconfigure();
+
+        instance
+    }
+
+    /// Set a new config for Cordic driver  
+    pub fn set_config(&mut self, config: Config) {
+        self.config = config;
+        self.reconfigure();
+    }
+
+    /// Set extra config for data count and data width.
+    pub fn extra_config(&mut self, arg_cnt: Count, arg_width: Width, res_width: Width) {
+        let peri = &self.cordic;
+        peri.set_argument_count(arg_cnt);
+        peri.set_data_width(arg_width, res_width);
+    }
+
+    fn reconfigure(&mut self) {
+        let peri = &self.cordic;
+        let config = &self.config;
+
+        if peri.ready_to_read() {
+            warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST");
+        };
+
+        peri.disable_irq();
+        peri.disable_write_dma();
+        peri.disable_read_dma();
+
+        // clean RRDY flag
+        while peri.ready_to_read() {
+            peri.read_result();
+        }
+
+        peri.set_func(config.function);
+        peri.set_precision(config.precision);
+        peri.set_scale(config.scale);
+        if config.first_result {
+            peri.set_result_count(Count::One)
+        } else {
+            peri.set_result_count(Count::Two)
+        }
+
+        match config.mode {
+            Mode::ZeroOverhead => (),
+            Mode::Interrupt => {
+                peri.enable_irq();
+            }
+            Mode::Dma => {
+                peri.enable_write_dma();
+                peri.enable_read_dma();
+            }
+        }
+
+        //self.state.input_buf.fill(0u32);
+    }
+
+    /// Run a CORDIC calculation
+    pub fn calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
+        match self.config.mode {
+            Mode::ZeroOverhead => {
+                if arg2s.is_none() {
+                    self.cordic.set_argument_count(Count::One);
+
+                    self.cordic.set_result_count(if self.config.first_result {
+                        if output.len() < arg1s.len() {
+                            panic!("Output buf length is not long enough")
+                        }
+                        Count::One
+                    } else {
+                        if output.len() < 2 * arg1s.len() {
+                            panic!("Output buf length is not long enough")
+                        }
+                        Count::Two
+                    });
+
+                    let mut cnt = 0;
+
+                    for &arg in arg1s.iter() {
+                        self.cordic.write_argument(f64_to_q1_31(arg));
+                        output[cnt] = q1_31_to_f64(self.cordic.read_result());
+                        cnt += 1;
+                    }
+
+                    cnt
+                } else {
+                    todo!()
+                }
+            }
+            Mode::Interrupt => todo!(),
+            Mode::Dma => todo!(),
+        }
+    }
+}
+
+impl<'d, T: Instance> Drop for Cordic<'d, T> {
+    fn drop(&mut self) {
+        T::disable();
+    }
+}
+
+foreach_interrupt!(
+    ($inst:ident, cordic, CORDIC, GLOBAL, $irq:ident) => {
+        impl Instance for peripherals::$inst {
+        }
+
+        impl sealed::Instance for peripherals::$inst {
+            fn regs() -> crate::pac::cordic::Cordic {
+                crate::pac::$inst
+            }
+        }
+    };
+);
+
+macro_rules! floating_fixed_convert {
+    ($f_to_q:ident, $q_to_f:ident, $unsigned_bin_typ:ty, $signed_bin_typ:ty, $float_ty:ty, $offset:literal, $min_positive:literal) => {
+        /// convert float point to fixed point format
+        pub fn $f_to_q(value: $float_ty) -> $unsigned_bin_typ {
+            const MIN_POSITIVE: $float_ty = unsafe { core::mem::transmute($min_positive) };
+
+            assert!(
+                (-1.0 as $float_ty) <= value,
+                "input value {} should be equal or greater than -1",
+                value
+            );
+
+            let value = if value == 1.0 as $float_ty{
+                (1.0 as $float_ty) - MIN_POSITIVE
+            } else {
+                assert!(
+                    value <= (1.0 as $float_ty) - MIN_POSITIVE,
+                    "input value {} should be equal or less than 1-2^(-{})",
+                    value, $offset
+                );
+                value
+            };
+
+            (value * ((1 as $unsigned_bin_typ << $offset) as $float_ty)) as $unsigned_bin_typ
+        }
+
+        #[inline(always)]
+        /// convert fixed point to float point format
+        pub fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
+            // It's needed to convert from unsigned to signed first, for correct result.
+            -(value as $signed_bin_typ as $float_ty) / ((1 as $unsigned_bin_typ << $offset) as $float_ty)
+        }
+    };
+}
+
+floating_fixed_convert!(
+    f64_to_q1_31,
+    q1_31_to_f64,
+    u32,
+    i32,
+    f64,
+    31,
+    0x3E00_0000_0000_0000u64 // binary form of 1f64^(-31)
+);
+
+floating_fixed_convert!(
+    f32_to_q1_15,
+    q1_15_to_f32,
+    u16,
+    i16,
+    f32,
+    15,
+    0x3800_0000u32 // binary form of 1f32^(-15)
+);
diff --git a/embassy-stm32/src/lib.rs b/embassy-stm32/src/lib.rs
index 6a3d1c463..ae2e95435 100644
--- a/embassy-stm32/src/lib.rs
+++ b/embassy-stm32/src/lib.rs
@@ -32,6 +32,8 @@ pub mod timer;
 pub mod adc;
 #[cfg(can)]
 pub mod can;
+#[cfg(cordic)]
+pub mod cordic;
 #[cfg(crc)]
 pub mod crc;
 #[cfg(cryp)]

From b595d942442a8b267e1311bcadedc8558183aa61 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Fri, 15 Mar 2024 15:12:51 +0800
Subject: [PATCH 02/17] stm32 CORDIC: split into multiple files

---
 embassy-stm32/src/cordic.rs        | 460 -----------------------------
 embassy-stm32/src/cordic/enums.rs  |  82 +++++
 embassy-stm32/src/cordic/mod.rs    | 206 +++++++++++++
 embassy-stm32/src/cordic/sealed.rs | 116 ++++++++
 embassy-stm32/src/cordic/utils.rs  |  59 ++++
 5 files changed, 463 insertions(+), 460 deletions(-)
 delete mode 100644 embassy-stm32/src/cordic.rs
 create mode 100644 embassy-stm32/src/cordic/enums.rs
 create mode 100644 embassy-stm32/src/cordic/mod.rs
 create mode 100644 embassy-stm32/src/cordic/sealed.rs
 create mode 100644 embassy-stm32/src/cordic/utils.rs

diff --git a/embassy-stm32/src/cordic.rs b/embassy-stm32/src/cordic.rs
deleted file mode 100644
index 952ee187a..000000000
--- a/embassy-stm32/src/cordic.rs
+++ /dev/null
@@ -1,460 +0,0 @@
-//! CORDIC co-processor
-
-use crate::peripherals;
-use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef};
-
-pub use enums::*;
-
-mod enums {
-    /// CORDIC function
-    #[allow(missing_docs)]
-    #[derive(Clone, Copy)]
-    pub enum Function {
-        Cos = 0,
-        Sin,
-        Phase,
-        Modulus,
-        Arctan,
-        Cosh,
-        Sinh,
-        Arctanh,
-        Ln,
-        Sqrt,
-    }
-
-    /// CORDIC precision
-    #[allow(missing_docs)]
-    #[derive(Clone, Copy)]
-    pub enum Precision {
-        Iters4 = 1,
-        Iters8,
-        Iters12,
-        Iters16,
-        Iters20,
-        Iters24,
-        Iters28,
-        Iters32,
-        Iters36,
-        Iters40,
-        Iters44,
-        Iters48,
-        Iters52,
-        Iters56,
-        Iters60,
-    }
-
-    /// CORDIC scale
-    #[allow(non_camel_case_types)]
-    #[allow(missing_docs)]
-    #[derive(Clone, Copy, Default)]
-    pub enum Scale {
-        #[default]
-        A1_R1 = 0,
-        A1o2_R2,
-        A1o4_R4,
-        A1o8_R8,
-        A1o16_R16,
-        A1o32_R32,
-        A1o64_R64,
-        A1o128_R128,
-    }
-
-    /// CORDIC argument/result count
-    #[allow(missing_docs)]
-    #[derive(Clone, Copy, Default)]
-    pub enum Count {
-        #[default]
-        One,
-        Two,
-    }
-
-    /// CORDIC argument/result data width
-    #[allow(missing_docs)]
-    #[derive(Clone, Copy)]
-    pub enum Width {
-        Bits32,
-        Bits16,
-    }
-
-    /// Cordic driver running mode
-    #[derive(Clone, Copy)]
-    pub enum Mode {
-        /// After caculation start, a read to RDATA register will block AHB until the caculation finished
-        ZeroOverhead,
-
-        /// Use CORDIC interrupt to trigger a read result value
-        Interrupt,
-
-        /// Use DMA to write/read value
-        Dma,
-    }
-}
-
-/// Low-level CORDIC access.
-#[cfg(feature = "unstable-pac")]
-pub mod low_level {
-    pub use super::sealed::*;
-}
-
-pub(crate) mod sealed {
-    use super::*;
-    use crate::pac::cordic::vals;
-
-    /// Cordic instance
-    pub trait Instance {
-        /// Get access to CORDIC registers
-        fn regs() -> crate::pac::cordic::Cordic;
-
-        /// Set Function value
-        fn set_func(&self, func: Function) {
-            Self::regs()
-                .csr()
-                .modify(|v| v.set_func(vals::Func::from_bits(func as u8)));
-        }
-
-        /// Set Precision value
-        fn set_precision(&self, precision: Precision) {
-            Self::regs()
-                .csr()
-                .modify(|v| v.set_precision(vals::Precision::from_bits(precision as u8)))
-        }
-
-        /// Set Scale value
-        fn set_scale(&self, scale: Scale) {
-            Self::regs()
-                .csr()
-                .modify(|v| v.set_scale(vals::Scale::from_bits(scale as u8)))
-        }
-
-        /// Enable global interrupt
-        fn enable_irq(&self) {
-            Self::regs().csr().modify(|v| v.set_ien(true))
-        }
-
-        /// Disable global interrupt
-        fn disable_irq(&self) {
-            Self::regs().csr().modify(|v| v.set_ien(false))
-        }
-
-        /// Enable Read DMA
-        fn enable_read_dma(&self) {
-            Self::regs().csr().modify(|v| {
-                v.set_dmaren(true);
-            })
-        }
-
-        /// Disable Read DMA
-        fn disable_read_dma(&self) {
-            Self::regs().csr().modify(|v| {
-                v.set_dmaren(false);
-            })
-        }
-
-        /// Enable Write DMA
-        fn enable_write_dma(&self) {
-            Self::regs().csr().modify(|v| {
-                v.set_dmawen(true);
-            })
-        }
-
-        /// Disable Write DMA
-        fn disable_write_dma(&self) {
-            Self::regs().csr().modify(|v| {
-                v.set_dmawen(false);
-            })
-        }
-
-        /// Set NARGS value
-        fn set_argument_count(&self, n: Count) {
-            Self::regs().csr().modify(|v| {
-                v.set_nargs(match n {
-                    Count::One => vals::Num::NUM1,
-                    Count::Two => vals::Num::NUM2,
-                })
-            })
-        }
-
-        /// Set NRES value
-        fn set_result_count(&self, n: Count) {
-            Self::regs().csr().modify(|v| {
-                v.set_nres(match n {
-                    Count::One => vals::Num::NUM1,
-                    Count::Two => vals::Num::NUM2,
-                });
-            })
-        }
-
-        /// Set ARGSIZE and RESSIZE value
-        fn set_data_width(&self, arg: Width, res: Width) {
-            Self::regs().csr().modify(|v| {
-                v.set_argsize(match arg {
-                    Width::Bits32 => vals::Size::BITS32,
-                    Width::Bits16 => vals::Size::BITS16,
-                });
-                v.set_ressize(match res {
-                    Width::Bits32 => vals::Size::BITS32,
-                    Width::Bits16 => vals::Size::BITS16,
-                })
-            })
-        }
-
-        /// Read RRDY flag
-        fn ready_to_read(&self) -> bool {
-            Self::regs().csr().read().rrdy()
-        }
-
-        /// Write value to WDATA
-        fn write_argument(&self, arg: u32) {
-            Self::regs().wdata().write_value(arg)
-        }
-
-        /// Read value from RDATA
-        fn read_result(&self) -> u32 {
-            Self::regs().rdata().read()
-        }
-    }
-}
-
-/// CORDIC driver
-pub struct Cordic<'d, T: Instance> {
-    cordic: PeripheralRef<'d, T>,
-    config: Config,
-    //state: State,
-}
-
-/// CORDIC instance trait
-pub trait Instance: sealed::Instance + Peripheral<P = Self> + crate::rcc::RccPeripheral {}
-
-/// CORDIC configuration
-pub struct Config {
-    function: Function,
-    precision: Precision,
-    scale: Scale,
-    mode: Mode,
-    first_result: bool,
-}
-
-// CORDIC running state
-//struct State {
-//    input_buf: [u32; 8],
-//    buf_len: usize,
-//}
-
-impl Config {
-    /// Create a config for Cordic driver
-    pub fn new(function: Function, precision: Precision, scale: Option<Scale>, mode: Mode, first_result: bool) -> Self {
-        Self {
-            function,
-            precision,
-            scale: scale.unwrap_or_default(),
-            mode,
-            first_result,
-        }
-    }
-
-    fn check_scale(&self) -> bool {
-        let scale_raw = self.scale as u8;
-
-        match self.function {
-            Function::Cos | Function::Sin | Function::Phase | Function::Modulus => 0 == scale_raw,
-            Function::Arctan => (0..=7).contains(&scale_raw),
-            Function::Cosh | Function::Sinh | Function::Arctanh => 1 == scale_raw,
-            Function::Ln => (1..=4).contains(&scale_raw),
-            Function::Sqrt => (0..=2).contains(&scale_raw),
-        }
-    }
-}
-
-impl<'d, T: Instance> Cordic<'d, T> {
-    /// Create a Cordic driver instance
-    ///
-    /// Note:  
-    /// If you need a periperhal -> CORDIC -> peripehral mode,  
-    /// you may want to set Cordic into [Mode::ZeroOverhead] mode, and add extra arguemnts with [Self::extra_config]
-    pub fn new(cordic: impl Peripheral<P = T> + 'd, config: Config) -> Self {
-        T::enable_and_reset();
-
-        into_ref!(cordic);
-
-        if !config.check_scale() {
-            panic!("Scale value is not compatible with Function")
-        }
-
-        let mut instance = Self {
-            cordic,
-            config,
-            // state: State {
-            //     input_buf: [0u32; 8],
-            //     buf_len: 0,
-            // },
-        };
-
-        instance.reconfigure();
-
-        instance
-    }
-
-    /// Set a new config for Cordic driver  
-    pub fn set_config(&mut self, config: Config) {
-        self.config = config;
-        self.reconfigure();
-    }
-
-    /// Set extra config for data count and data width.
-    pub fn extra_config(&mut self, arg_cnt: Count, arg_width: Width, res_width: Width) {
-        let peri = &self.cordic;
-        peri.set_argument_count(arg_cnt);
-        peri.set_data_width(arg_width, res_width);
-    }
-
-    fn reconfigure(&mut self) {
-        let peri = &self.cordic;
-        let config = &self.config;
-
-        if peri.ready_to_read() {
-            warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST");
-        };
-
-        peri.disable_irq();
-        peri.disable_write_dma();
-        peri.disable_read_dma();
-
-        // clean RRDY flag
-        while peri.ready_to_read() {
-            peri.read_result();
-        }
-
-        peri.set_func(config.function);
-        peri.set_precision(config.precision);
-        peri.set_scale(config.scale);
-        if config.first_result {
-            peri.set_result_count(Count::One)
-        } else {
-            peri.set_result_count(Count::Two)
-        }
-
-        match config.mode {
-            Mode::ZeroOverhead => (),
-            Mode::Interrupt => {
-                peri.enable_irq();
-            }
-            Mode::Dma => {
-                peri.enable_write_dma();
-                peri.enable_read_dma();
-            }
-        }
-
-        //self.state.input_buf.fill(0u32);
-    }
-
-    /// Run a CORDIC calculation
-    pub fn calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
-        match self.config.mode {
-            Mode::ZeroOverhead => {
-                if arg2s.is_none() {
-                    self.cordic.set_argument_count(Count::One);
-
-                    self.cordic.set_result_count(if self.config.first_result {
-                        if output.len() < arg1s.len() {
-                            panic!("Output buf length is not long enough")
-                        }
-                        Count::One
-                    } else {
-                        if output.len() < 2 * arg1s.len() {
-                            panic!("Output buf length is not long enough")
-                        }
-                        Count::Two
-                    });
-
-                    let mut cnt = 0;
-
-                    for &arg in arg1s.iter() {
-                        self.cordic.write_argument(f64_to_q1_31(arg));
-                        output[cnt] = q1_31_to_f64(self.cordic.read_result());
-                        cnt += 1;
-                    }
-
-                    cnt
-                } else {
-                    todo!()
-                }
-            }
-            Mode::Interrupt => todo!(),
-            Mode::Dma => todo!(),
-        }
-    }
-}
-
-impl<'d, T: Instance> Drop for Cordic<'d, T> {
-    fn drop(&mut self) {
-        T::disable();
-    }
-}
-
-foreach_interrupt!(
-    ($inst:ident, cordic, CORDIC, GLOBAL, $irq:ident) => {
-        impl Instance for peripherals::$inst {
-        }
-
-        impl sealed::Instance for peripherals::$inst {
-            fn regs() -> crate::pac::cordic::Cordic {
-                crate::pac::$inst
-            }
-        }
-    };
-);
-
-macro_rules! floating_fixed_convert {
-    ($f_to_q:ident, $q_to_f:ident, $unsigned_bin_typ:ty, $signed_bin_typ:ty, $float_ty:ty, $offset:literal, $min_positive:literal) => {
-        /// convert float point to fixed point format
-        pub fn $f_to_q(value: $float_ty) -> $unsigned_bin_typ {
-            const MIN_POSITIVE: $float_ty = unsafe { core::mem::transmute($min_positive) };
-
-            assert!(
-                (-1.0 as $float_ty) <= value,
-                "input value {} should be equal or greater than -1",
-                value
-            );
-
-            let value = if value == 1.0 as $float_ty{
-                (1.0 as $float_ty) - MIN_POSITIVE
-            } else {
-                assert!(
-                    value <= (1.0 as $float_ty) - MIN_POSITIVE,
-                    "input value {} should be equal or less than 1-2^(-{})",
-                    value, $offset
-                );
-                value
-            };
-
-            (value * ((1 as $unsigned_bin_typ << $offset) as $float_ty)) as $unsigned_bin_typ
-        }
-
-        #[inline(always)]
-        /// convert fixed point to float point format
-        pub fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
-            // It's needed to convert from unsigned to signed first, for correct result.
-            -(value as $signed_bin_typ as $float_ty) / ((1 as $unsigned_bin_typ << $offset) as $float_ty)
-        }
-    };
-}
-
-floating_fixed_convert!(
-    f64_to_q1_31,
-    q1_31_to_f64,
-    u32,
-    i32,
-    f64,
-    31,
-    0x3E00_0000_0000_0000u64 // binary form of 1f64^(-31)
-);
-
-floating_fixed_convert!(
-    f32_to_q1_15,
-    q1_15_to_f32,
-    u16,
-    i16,
-    f32,
-    15,
-    0x3800_0000u32 // binary form of 1f32^(-15)
-);
diff --git a/embassy-stm32/src/cordic/enums.rs b/embassy-stm32/src/cordic/enums.rs
new file mode 100644
index 000000000..4697a1df1
--- /dev/null
+++ b/embassy-stm32/src/cordic/enums.rs
@@ -0,0 +1,82 @@
+/// CORDIC function
+#[allow(missing_docs)]
+#[derive(Clone, Copy)]
+pub enum Function {
+    Cos = 0,
+    Sin,
+    Phase,
+    Modulus,
+    Arctan,
+    Cosh,
+    Sinh,
+    Arctanh,
+    Ln,
+    Sqrt,
+}
+
+/// CORDIC precision
+#[allow(missing_docs)]
+#[derive(Clone, Copy)]
+pub enum Precision {
+    Iters4 = 1,
+    Iters8,
+    Iters12,
+    Iters16,
+    Iters20,
+    Iters24,
+    Iters28,
+    Iters32,
+    Iters36,
+    Iters40,
+    Iters44,
+    Iters48,
+    Iters52,
+    Iters56,
+    Iters60,
+}
+
+/// CORDIC scale
+#[allow(non_camel_case_types)]
+#[allow(missing_docs)]
+#[derive(Clone, Copy, Default)]
+pub enum Scale {
+    #[default]
+    A1_R1 = 0,
+    A1o2_R2,
+    A1o4_R4,
+    A1o8_R8,
+    A1o16_R16,
+    A1o32_R32,
+    A1o64_R64,
+    A1o128_R128,
+}
+
+/// CORDIC argument/result count
+#[allow(missing_docs)]
+#[derive(Clone, Copy, Default)]
+pub enum Count {
+    #[default]
+    One,
+    Two,
+}
+
+/// CORDIC argument/result data width
+#[allow(missing_docs)]
+#[derive(Clone, Copy)]
+pub enum Width {
+    Bits32,
+    Bits16,
+}
+
+/// Cordic driver running mode
+#[derive(Clone, Copy)]
+pub enum Mode {
+    /// After a calculation starts, a read of the RDATA register blocks the AHB until the calculation has finished
+    ZeroOverhead,
+
+    /// Use CORDIC interrupt to trigger a read result value
+    Interrupt,
+
+    /// Use DMA to write/read value
+    Dma,
+}
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
new file mode 100644
index 000000000..c0a69b757
--- /dev/null
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -0,0 +1,206 @@
+//! CORDIC co-processor
+
+use crate::peripherals;
+use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef};
+
+mod enums;
+pub use enums::*;
+
+pub mod utils;
+
+pub(crate) mod sealed;
+
+/// Low-level CORDIC access.
+#[cfg(feature = "unstable-pac")]
+pub mod low_level {
+    pub use super::sealed::*;
+}
+
+/// CORDIC driver
+pub struct Cordic<'d, T: Instance> {
+    cordic: PeripheralRef<'d, T>,
+    config: Config,
+    //state: State,
+}
+
+/// CORDIC instance trait
+pub trait Instance: sealed::Instance + Peripheral<P = Self> + crate::rcc::RccPeripheral {}
+
+/// CORDIC configuration
+pub struct Config {
+    function: Function,
+    precision: Precision,
+    scale: Scale,
+    mode: Mode,
+    first_result: bool,
+}
+
+// CORDIC running state
+//struct State {
+//    input_buf: [u32; 8],
+//    buf_len: usize,
+//}
+
+impl Config {
+    /// Create a config for Cordic driver
+    pub fn new(function: Function, precision: Precision, scale: Option<Scale>, mode: Mode, first_result: bool) -> Self {
+        Self {
+            function,
+            precision,
+            scale: scale.unwrap_or_default(),
+            mode,
+            first_result,
+        }
+    }
+
+    fn check_scale(&self) -> bool {
+        let scale_raw = self.scale as u8;
+
+        match self.function {
+            Function::Cos | Function::Sin | Function::Phase | Function::Modulus => 0 == scale_raw,
+            Function::Arctan => (0..=7).contains(&scale_raw),
+            Function::Cosh | Function::Sinh | Function::Arctanh => 1 == scale_raw,
+            Function::Ln => (1..=4).contains(&scale_raw),
+            Function::Sqrt => (0..=2).contains(&scale_raw),
+        }
+    }
+}
+
+impl<'d, T: Instance> Cordic<'d, T> {
+    /// Create a Cordic driver instance
+    ///
+    /// Note:  
+    /// If you need a periperhal -> CORDIC -> peripehral mode,  
+    /// you may want to set Cordic into [Mode::ZeroOverhead] mode, and add extra arguemnts with [Self::extra_config]
+    pub fn new(cordic: impl Peripheral<P = T> + 'd, config: Config) -> Self {
+        T::enable_and_reset();
+
+        into_ref!(cordic);
+
+        if !config.check_scale() {
+            panic!("Scale value is not compatible with Function")
+        }
+
+        let mut instance = Self {
+            cordic,
+            config,
+            // state: State {
+            //     input_buf: [0u32; 8],
+            //     buf_len: 0,
+            // },
+        };
+
+        instance.reconfigure();
+
+        instance
+    }
+
+    /// Set a new config for Cordic driver  
+    pub fn set_config(&mut self, config: Config) {
+        self.config = config;
+        self.reconfigure();
+    }
+
+    /// Set extra config for data count and data width.
+    pub fn extra_config(&mut self, arg_cnt: Count, arg_width: Width, res_width: Width) {
+        let peri = &self.cordic;
+        peri.set_argument_count(arg_cnt);
+        peri.set_data_width(arg_width, res_width);
+    }
+
+    fn reconfigure(&mut self) {
+        let peri = &self.cordic;
+        let config = &self.config;
+
+        if peri.ready_to_read() {
+            warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST");
+        };
+
+        peri.disable_irq();
+        peri.disable_write_dma();
+        peri.disable_read_dma();
+
+        // clean RRDY flag
+        while peri.ready_to_read() {
+            peri.read_result();
+        }
+
+        peri.set_func(config.function);
+        peri.set_precision(config.precision);
+        peri.set_scale(config.scale);
+        if config.first_result {
+            peri.set_result_count(Count::One)
+        } else {
+            peri.set_result_count(Count::Two)
+        }
+
+        match config.mode {
+            Mode::ZeroOverhead => (),
+            Mode::Interrupt => {
+                peri.enable_irq();
+            }
+            Mode::Dma => {
+                peri.enable_write_dma();
+                peri.enable_read_dma();
+            }
+        }
+
+        //self.state.input_buf.fill(0u32);
+    }
+
+    /// Run a CORDIC calculation
+    pub fn calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
+        match self.config.mode {
+            Mode::ZeroOverhead => {
+                if arg2s.is_none() {
+                    self.cordic.set_argument_count(Count::One);
+
+                    self.cordic.set_result_count(if self.config.first_result {
+                        if output.len() < arg1s.len() {
+                            panic!("Output buf length is not long enough")
+                        }
+                        Count::One
+                    } else {
+                        if output.len() < 2 * arg1s.len() {
+                            panic!("Output buf length is not long enough")
+                        }
+                        Count::Two
+                    });
+
+                    let mut cnt = 0;
+
+                    for &arg in arg1s.iter() {
+                        self.cordic.write_argument(utils::f64_to_q1_31(arg));
+                        output[cnt] = utils::q1_31_to_f64(self.cordic.read_result());
+                        cnt += 1;
+                    }
+
+                    cnt
+                } else {
+                    todo!()
+                }
+            }
+            Mode::Interrupt => todo!(),
+            Mode::Dma => todo!(),
+        }
+    }
+}
+
+impl<'d, T: Instance> Drop for Cordic<'d, T> {
+    fn drop(&mut self) {
+        T::disable();
+    }
+}
+
+foreach_interrupt!(
+    ($inst:ident, cordic, CORDIC, GLOBAL, $irq:ident) => {
+        impl Instance for peripherals::$inst {
+        }
+
+        impl sealed::Instance for peripherals::$inst {
+            fn regs() -> crate::pac::cordic::Cordic {
+                crate::pac::$inst
+            }
+        }
+    };
+);
diff --git a/embassy-stm32/src/cordic/sealed.rs b/embassy-stm32/src/cordic/sealed.rs
new file mode 100644
index 000000000..0f00e380c
--- /dev/null
+++ b/embassy-stm32/src/cordic/sealed.rs
@@ -0,0 +1,116 @@
+use super::*;
+use crate::pac::cordic::vals;
+
+/// Register-level operations of a CORDIC peripheral instance (thin wrappers around CSR / WDATA / RDATA)
+pub trait Instance {
+    /// Get access to the CORDIC register block
+    fn regs() -> crate::pac::cordic::Cordic;
+
+    /// Select the function to compute (CSR FUNC field)
+    fn set_func(&self, func: Function) {
+        Self::regs()
+            .csr()
+            .modify(|v| v.set_func(vals::Func::from_bits(func as u8)));
+    }
+
+    /// Set the precision (CSR PRECISION field)
+    fn set_precision(&self, precision: Precision) {
+        Self::regs()
+            .csr()
+            .modify(|v| v.set_precision(vals::Precision::from_bits(precision as u8)))
+    }
+
+    /// Set the scale (CSR SCALE field)
+    fn set_scale(&self, scale: Scale) {
+        Self::regs()
+            .csr()
+            .modify(|v| v.set_scale(vals::Scale::from_bits(scale as u8)))
+    }
+
+    /// Enable the CORDIC interrupt (set CSR IEN)
+    fn enable_irq(&self) {
+        Self::regs().csr().modify(|v| v.set_ien(true))
+    }
+
+    /// Disable the CORDIC interrupt (clear CSR IEN)
+    fn disable_irq(&self) {
+        Self::regs().csr().modify(|v| v.set_ien(false))
+    }
+
+    /// Enable the DMA read channel (set CSR DMAREN)
+    fn enable_read_dma(&self) {
+        Self::regs().csr().modify(|v| {
+            v.set_dmaren(true);
+        })
+    }
+
+    /// Disable the DMA read channel (clear CSR DMAREN)
+    fn disable_read_dma(&self) {
+        Self::regs().csr().modify(|v| {
+            v.set_dmaren(false);
+        })
+    }
+
+    /// Enable the DMA write channel (set CSR DMAWEN)
+    fn enable_write_dma(&self) {
+        Self::regs().csr().modify(|v| {
+            v.set_dmawen(true);
+        })
+    }
+
+    /// Disable the DMA write channel (clear CSR DMAWEN)
+    fn disable_write_dma(&self) {
+        Self::regs().csr().modify(|v| {
+            v.set_dmawen(false);
+        })
+    }
+
+    /// Set the argument count (CSR NARGS field)
+    fn set_argument_count(&self, n: Count) {
+        Self::regs().csr().modify(|v| {
+            v.set_nargs(match n {
+                Count::One => vals::Num::NUM1,
+                Count::Two => vals::Num::NUM2,
+            })
+        })
+    }
+
+    /// Set the result count (CSR NRES field)
+    fn set_result_count(&self, n: Count) {
+        Self::regs().csr().modify(|v| {
+            v.set_nres(match n {
+                Count::One => vals::Num::NUM1,
+                Count::Two => vals::Num::NUM2,
+            });
+        })
+    }
+
+    /// Set the argument and result data widths (CSR ARGSIZE and RESSIZE fields) in one register update
+    fn set_data_width(&self, arg: Width, res: Width) {
+        Self::regs().csr().modify(|v| {
+            v.set_argsize(match arg {
+                Width::Bits32 => vals::Size::BITS32,
+                Width::Bits16 => vals::Size::BITS16,
+            });
+            v.set_ressize(match res {
+                Width::Bits32 => vals::Size::BITS32,
+                Width::Bits16 => vals::Size::BITS16,
+            })
+        })
+    }
+
+    /// Read the CSR RRDY flag (the driver treats `true` as "a result is waiting to be read")
+    fn ready_to_read(&self) -> bool {
+        Self::regs().csr().read().rrdy()
+    }
+
+    /// Write a raw argument word to the WDATA register
+    fn write_argument(&self, arg: u32) {
+        Self::regs().wdata().write_value(arg)
+    }
+
+    /// Read a raw result word from the RDATA register
+    fn read_result(&self) -> u32 {
+        Self::regs().rdata().read()
+    }
+}
diff --git a/embassy-stm32/src/cordic/utils.rs b/embassy-stm32/src/cordic/utils.rs
new file mode 100644
index 000000000..3f055c34b
--- /dev/null
+++ b/embassy-stm32/src/cordic/utils.rs
@@ -0,0 +1,59 @@
+//! Common math utils
+
+macro_rules! floating_fixed_convert {
+    ($f_to_q:ident, $q_to_f:ident, $unsigned_bin_typ:ty, $signed_bin_typ:ty, $float_ty:ty, $offset:literal, $min_positive:literal) => {
+        /// Convert a float in `[-1, 1]` to its q1.x fixed point bits; panics when the input is out of range.
+        pub fn $f_to_q(value: $float_ty) -> $unsigned_bin_typ {
+            // SAFETY: `$min_positive` must be the IEEE-754 bit pattern of 2^(-$offset), in an integer as wide as the float.
+            const MIN_POSITIVE: $float_ty = unsafe { core::mem::transmute($min_positive) };
+
+            assert!(
+                (-1.0 as $float_ty) <= value,
+                "input value {} should be equal or greater than -1",
+                value
+            );
+
+            let value = if value == 1.0 as $float_ty {
+                // make an exception for a user specifying exactly 1.0:
+                // convert 1.0 to the max representable value of the q1.x format
+                (1.0 as $float_ty) - MIN_POSITIVE
+            } else {
+                assert!(
+                    value <= (1.0 as $float_ty) - MIN_POSITIVE,
+                    "input value {} should be equal or less than 1-2^(-{})",
+                    value, $offset
+                );
+                value
+            };
+            // Go through the signed type: a direct float -> unsigned `as` cast saturates negatives to 0.
+            (value * ((1 as $unsigned_bin_typ << $offset) as $float_ty)) as $signed_bin_typ as $unsigned_bin_typ
+        }
+
+        #[inline(always)]
+        /// Convert q1.x fixed point bits (two's complement) to float.
+        pub fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
+            // Reinterpret the bits as signed first so negative values keep their sign; the divisor is positive.
+            (value as $signed_bin_typ as $float_ty) / ((1 as $unsigned_bin_typ << $offset) as $float_ty)
+        }
+    };
+}
+
+floating_fixed_convert!(
+    f64_to_q1_31,
+    q1_31_to_f64,
+    u32,
+    i32,
+    f64,
+    31,
+    0x3E00_0000_0000_0000u64 // IEEE-754 bit pattern of 2f64^(-31)
+);
+
+floating_fixed_convert!(
+    f32_to_q1_15,
+    q1_15_to_f32,
+    u16,
+    i16,
+    f32,
+    15,
+    0x3800_0000u32 // IEEE-754 bit pattern of 2f32^(-15)
+);

From a1ca9088b4e3b4644428eab80e8502a55b2cbe8f Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Fri, 15 Mar 2024 19:34:55 +0800
Subject: [PATCH 03/17] stm32 CORDIC: ZeroOverhead q1.31 mode

---
 embassy-stm32/src/cordic/enums.rs |   7 +-
 embassy-stm32/src/cordic/mod.rs   | 290 ++++++++++++++++++++++++------
 2 files changed, 241 insertions(+), 56 deletions(-)

diff --git a/embassy-stm32/src/cordic/enums.rs b/embassy-stm32/src/cordic/enums.rs
index 4697a1df1..3e1c47f7f 100644
--- a/embassy-stm32/src/cordic/enums.rs
+++ b/embassy-stm32/src/cordic/enums.rs
@@ -16,14 +16,15 @@ pub enum Function {
 
 /// CORDIC precision
 #[allow(missing_docs)]
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Default)]
 pub enum Precision {
     Iters4 = 1,
     Iters8,
     Iters12,
     Iters16,
     Iters20,
-    Iters24,
+    #[default]
+    Iters24, // this value is recomended by Reference Manual
     Iters28,
     Iters32,
     Iters36,
@@ -38,7 +39,7 @@ pub enum Precision {
 /// CORDIC scale
 #[allow(non_camel_case_types)]
 #[allow(missing_docs)]
-#[derive(Clone, Copy, Default)]
+#[derive(Clone, Copy, Default, PartialEq)]
 pub enum Scale {
     #[default]
     A1_R1 = 0,
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index c0a69b757..b15521ca6 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -10,6 +10,10 @@ pub mod utils;
 
 pub(crate) mod sealed;
 
+// length of pre-allocated [u32] memory for CORDIC input,
+// length should be multiple of 2
+const INPUT_BUF_LEN: usize = 8;
+
 /// Low-level CORDIC access.
 #[cfg(feature = "unstable-pac")]
 pub mod low_level {
@@ -20,7 +24,7 @@ pub mod low_level {
 pub struct Cordic<'d, T: Instance> {
     cordic: PeripheralRef<'d, T>,
     config: Config,
-    //state: State,
+    state: State,
 }
 
 /// CORDIC instance trait
@@ -28,27 +32,33 @@ pub trait Instance: sealed::Instance + Peripheral<P = Self> + crate::rcc::RccPer
 
 /// CORDIC configuration
 pub struct Config {
+    mode: Mode,
     function: Function,
     precision: Precision,
     scale: Scale,
-    mode: Mode,
     first_result: bool,
 }
 
 // CORDIC running state
-//struct State {
-//    input_buf: [u32; 8],
-//    buf_len: usize,
-//}
+struct State {
+    input_buf: [u32; INPUT_BUF_LEN],
+    buf_index: usize,
+}
 
 impl Config {
     /// Create a config for Cordic driver
-    pub fn new(function: Function, precision: Precision, scale: Option<Scale>, mode: Mode, first_result: bool) -> Self {
+    pub fn new(
+        mode: Mode,
+        function: Function,
+        precision: Option<Precision>,
+        scale: Option<Scale>,
+        first_result: bool,
+    ) -> Self {
         Self {
-            function,
-            precision,
-            scale: scale.unwrap_or_default(),
             mode,
+            function,
+            precision: precision.unwrap_or_default(),
+            scale: scale.unwrap_or_default(),
             first_result,
         }
     }
@@ -66,6 +76,7 @@ impl Config {
     }
 }
 
+// common method
 impl<'d, T: Instance> Cordic<'d, T> {
     /// Create a Cordic driver instance
     ///
@@ -84,10 +95,10 @@ impl<'d, T: Instance> Cordic<'d, T> {
         let mut instance = Self {
             cordic,
             config,
-            // state: State {
-            //     input_buf: [0u32; 8],
-            //     buf_len: 0,
-            // },
+            state: State {
+                input_buf: [0u32; 8],
+                buf_index: 0,
+            },
         };
 
         instance.reconfigure();
@@ -128,6 +139,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         peri.set_func(config.function);
         peri.set_precision(config.precision);
         peri.set_scale(config.scale);
+
         if config.first_result {
             peri.set_result_count(Count::One)
         } else {
@@ -145,44 +157,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
             }
         }
 
-        //self.state.input_buf.fill(0u32);
-    }
-
-    /// Run a CORDIC calculation
-    pub fn calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
-        match self.config.mode {
-            Mode::ZeroOverhead => {
-                if arg2s.is_none() {
-                    self.cordic.set_argument_count(Count::One);
-
-                    self.cordic.set_result_count(if self.config.first_result {
-                        if output.len() < arg1s.len() {
-                            panic!("Output buf length is not long enough")
-                        }
-                        Count::One
-                    } else {
-                        if output.len() < 2 * arg1s.len() {
-                            panic!("Output buf length is not long enough")
-                        }
-                        Count::Two
-                    });
-
-                    let mut cnt = 0;
-
-                    for &arg in arg1s.iter() {
-                        self.cordic.write_argument(utils::f64_to_q1_31(arg));
-                        output[cnt] = utils::q1_31_to_f64(self.cordic.read_result());
-                        cnt += 1;
-                    }
-
-                    cnt
-                } else {
-                    todo!()
-                }
-            }
-            Mode::Interrupt => todo!(),
-            Mode::Dma => todo!(),
-        }
+        self.state.input_buf.fill(0u32);
+        self.state.buf_index = 0;
     }
 }
 
@@ -192,8 +168,216 @@ impl<'d, T: Instance> Drop for Cordic<'d, T> {
     }
 }
 
+// q1.31 related
+impl<'d, T: Instance> Cordic<'d, T> {
+    /// Run a CORDIC calculation
+    pub fn calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
+        let peri = &self.cordic;
+        let config = &self.config;
+
+        assert!(
+            match config.first_result {
+                true => output.len() >= arg1s.len(),
+                false => output.len() >= 2 * arg1s.len(),
+            },
+            "Output buf length is not long enough"
+        );
+
+        self.check_input_f64(arg1s, arg2s);
+
+        peri.set_result_count(if config.first_result { Count::One } else { Count::Two });
+        peri.set_data_width(Width::Bits32, Width::Bits32);
+
+        let state = &mut self.state;
+
+        let mut output_count = 0;
+
+        let mut consumed_input_len = 0;
+
+        match config.mode {
+            Mode::ZeroOverhead => {
+                // put double input into cordic
+                if arg2s.is_some() && !arg2s.unwrap().is_empty() {
+                    let arg2s = arg2s.unwrap();
+
+                    peri.set_argument_count(Count::Two);
+
+                    let double_value = arg1s.iter().zip(arg2s);
+                    consumed_input_len = double_value.len();
+
+                    for (arg1, arg2) in double_value {
+                        // if input_buf is full, send values to cordic
+                        if state.buf_index == INPUT_BUF_LEN - 1 {
+                            for arg in state.input_buf.chunks(2) {
+                                peri.write_argument(arg[0]);
+                                peri.write_argument(arg[1]);
+
+                                output[output_count] = utils::q1_31_to_f64(peri.read_result());
+                                output_count += 1;
+
+                                if !config.first_result {
+                                    output[output_count] = utils::q1_31_to_f64(peri.read_result());
+                                    output_count += 1;
+                                }
+                            }
+
+                            state.buf_index = 0;
+                        }
+
+                        for &&arg in [arg1, arg2].iter() {
+                            state.input_buf[state.buf_index] = utils::f64_to_q1_31(arg);
+                            state.buf_index += 1;
+                        }
+                    }
+
+                    // put left paired args into cordic
+                    if state.buf_index > 0 {
+                        for arg in state.input_buf[..state.buf_index].chunks(2) {
+                            peri.write_argument(arg[0]);
+                            peri.write_argument(arg[1]);
+
+                            output[output_count] = utils::q1_31_to_f64(peri.read_result());
+                            output_count += 1;
+
+                            if !config.first_result {
+                                output[output_count] = utils::q1_31_to_f64(peri.read_result());
+                                output_count += 1;
+                            }
+                        }
+
+                        state.buf_index = 0;
+                    }
+                }
+
+                // put single input into cordic
+                let input_left = &arg1s[consumed_input_len..];
+
+                if !input_left.is_empty() {
+                    peri.set_argument_count(Count::One);
+
+                    for &arg in input_left.iter() {
+                        peri.write_argument(utils::f64_to_q1_31(arg));
+
+                        output[output_count] = utils::q1_31_to_f64(peri.read_result());
+                        output_count += 1;
+
+                        if !config.first_result {
+                            output[output_count] = utils::q1_31_to_f64(peri.read_result());
+                            output_count += 1;
+                        }
+                    }
+                }
+
+                output_count
+            }
+            Mode::Interrupt => todo!(),
+            Mode::Dma => todo!(),
+        }
+    }
+
+    fn check_input_f64(&self, arg1s: &[f64], arg2s: Option<&[f64]>) {
+        let config = &self.config;
+
+        use Function::*;
+
+        // check SCALE value
+        match config.function {
+            Cos | Sin | Phase | Modulus => assert!(Scale::A1_R1 == config.scale, "SCALE should be 0"),
+            Arctan => assert!(
+                (0..=7).contains(&(config.scale as u8)),
+                "SCALE should be: 0 <= SCALE <= 7"
+            ),
+            Cosh | Sinh | Arctanh => assert!(Scale::A1o2_R2 == config.scale, "SCALE should be 1"),
+
+            Ln => assert!(
+                (1..=4).contains(&(config.scale as u8)),
+                "SCALE should be: 1 <= SCALE <= 4"
+            ),
+            Sqrt => assert!(
+                (0..=2).contains(&(config.scale as u8)),
+                "SCALE should be: 0 <= SCALE <= 2"
+            ),
+        }
+
+        // check ARG1 value
+        match config.function {
+            Cos | Sin | Phase | Modulus | Arctan => {
+                assert!(
+                    arg1s.iter().all(|v| (-1.0..=1.0).contains(v)),
+                    "ARG1 should be: -1 <= ARG1 <= 1"
+                );
+            }
+
+            Cosh | Sinh => assert!(
+                arg1s.iter().all(|v| (-0.559..=0.559).contains(v)),
+                "ARG1 should be: -0.559 <= ARG1 <= 0.559"
+            ),
+
+            Arctanh => assert!(
+                arg1s.iter().all(|v| (-0.403..=0.403).contains(v)),
+                "ARG1 should be: -0.403 <= ARG1 <= 0.403"
+            ),
+
+            Ln => {
+                match config.scale {
+                    Scale::A1o2_R2 => assert!(
+                        arg1s.iter().all(|v| (0.05354..0.5).contains(v)),
+                        "When SCALE set to 1, ARG1 should be: 0.05354 <= ARG1 < 0.5"
+                    ),
+                    Scale::A1o4_R4 => assert!(
+                        arg1s.iter().all(|v| (0.25..0.75).contains(v)),
+                        "When SCALE set to 2, ARG1 should be: 0.25 <= ARG1 < 0.75"
+                    ),
+                    Scale::A1o8_R8 => assert!(
+                        arg1s.iter().all(|v| (0.375..0.875).contains(v)),
+                        "When SCALE set to 3, ARG1 should be: 0.375 <= ARG1 < 0.875"
+                    ),
+                    Scale::A1o16_R16 => assert!(
+                        arg1s.iter().all(|v| (0.4375f64..0.584f64).contains(v)),
+                        "When SCALE set to 4, ARG1 should be: 0.4375 <= ARG1 < 0.584"
+                    ),
+                    _ => unreachable!(),
+                };
+            }
+
+            Function::Sqrt => match config.scale {
+                Scale::A1_R1 => assert!(
+                    arg1s.iter().all(|v| (0.027..0.75).contains(v)),
+                    "When SCALE set to 0, ARG1 should be: 0.027 <= ARG1 < 0.75"
+                ),
+                Scale::A1o2_R2 => assert!(
+                    arg1s.iter().all(|v| (0.375..0.875).contains(v)),
+                    "When SCALE set to 1, ARG1 should be: 0.375 <= ARG1 < 0.875"
+                ),
+                Scale::A1o4_R4 => assert!(
+                    arg1s.iter().all(|v| (0.4375..0.585).contains(v)),
+                    "When SCALE set to 2, ARG1 should be: 0.4375  <= ARG1 < 0.585"
+                ),
+                _ => unreachable!(),
+            },
+        }
+
+        // check ARG2 value
+        if let Some(arg2s) = arg2s {
+            match config.function {
+                Cos | Sin => assert!(
+                    arg2s.iter().all(|v| (0.0..=1.0).contains(v)),
+                    "ARG2 should be: 0 <= ARG2 <= 1"
+                ),
+
+                Phase | Modulus => assert!(
+                    arg2s.iter().all(|v| (-1.0..=1.0).contains(v)),
+                    "ARG2 should be: -1 <= ARG2 <= 1"
+                ),
+
+                _ => (),
+            }
+        }
+    }
+}
+
 foreach_interrupt!(
-    ($inst:ident, cordic, CORDIC, GLOBAL, $irq:ident) => {
+    ($inst:ident, cordic, $block:ident, GLOBAL, $irq:ident) => {
         impl Instance for peripherals::$inst {
         }
 

From 5d12f594303bdb76bf2356d9fc0661826e2e658e Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Sat, 16 Mar 2024 00:25:38 +0800
Subject: [PATCH 04/17] stm32 CORDIC: make use of "preload" feature

---
 embassy-stm32/src/cordic/mod.rs | 180 +++++++++++++++-----------------
 1 file changed, 85 insertions(+), 95 deletions(-)

diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index b15521ca6..997ace113 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -22,9 +22,8 @@ pub mod low_level {
 
 /// CORDIC driver
 pub struct Cordic<'d, T: Instance> {
-    cordic: PeripheralRef<'d, T>,
+    peri: PeripheralRef<'d, T>,
     config: Config,
-    state: State,
 }
 
 /// CORDIC instance trait
@@ -83,23 +82,16 @@ impl<'d, T: Instance> Cordic<'d, T> {
     /// Note:  
     /// If you need a periperhal -> CORDIC -> peripehral mode,  
     /// you may want to set Cordic into [Mode::ZeroOverhead] mode, and add extra arguemnts with [Self::extra_config]
-    pub fn new(cordic: impl Peripheral<P = T> + 'd, config: Config) -> Self {
+    pub fn new(peri: impl Peripheral<P = T> + 'd, config: Config) -> Self {
         T::enable_and_reset();
 
-        into_ref!(cordic);
+        into_ref!(peri);
 
         if !config.check_scale() {
             panic!("Scale value is not compatible with Function")
         }
 
-        let mut instance = Self {
-            cordic,
-            config,
-            state: State {
-                input_buf: [0u32; 8],
-                buf_index: 0,
-            },
-        };
+        let mut instance = Self { peri, config };
 
         instance.reconfigure();
 
@@ -114,51 +106,71 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
     /// Set extra config for data count and data width.
     pub fn extra_config(&mut self, arg_cnt: Count, arg_width: Width, res_width: Width) {
-        let peri = &self.cordic;
-        peri.set_argument_count(arg_cnt);
-        peri.set_data_width(arg_width, res_width);
+        self.peri.set_argument_count(arg_cnt);
+        self.peri.set_data_width(arg_width, res_width);
     }
 
     fn reconfigure(&mut self) {
-        let peri = &self.cordic;
-        let config = &self.config;
-
-        if peri.ready_to_read() {
+        if self.peri.ready_to_read() {
             warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST");
         };
 
-        peri.disable_irq();
-        peri.disable_write_dma();
-        peri.disable_read_dma();
+        self.peri.disable_irq();
+        self.peri.disable_write_dma();
+        self.peri.disable_read_dma();
 
         // clean RRDY flag
-        while peri.ready_to_read() {
-            peri.read_result();
+        while self.peri.ready_to_read() {
+            self.peri.read_result();
         }
 
-        peri.set_func(config.function);
-        peri.set_precision(config.precision);
-        peri.set_scale(config.scale);
+        self.peri.set_func(self.config.function);
+        self.peri.set_precision(self.config.precision);
+        self.peri.set_scale(self.config.scale);
 
-        if config.first_result {
-            peri.set_result_count(Count::One)
+        if self.config.first_result {
+            self.peri.set_result_count(Count::One)
         } else {
-            peri.set_result_count(Count::Two)
+            self.peri.set_result_count(Count::Two)
         }
 
-        match config.mode {
+        match self.config.mode {
             Mode::ZeroOverhead => (),
             Mode::Interrupt => {
-                peri.enable_irq();
+                self.peri.enable_irq();
             }
             Mode::Dma => {
-                peri.enable_write_dma();
-                peri.enable_read_dma();
+                self.peri.enable_write_dma();
+                self.peri.enable_read_dma();
             }
         }
+    }
 
-        self.state.input_buf.fill(0u32);
-        self.state.buf_index = 0;
+    fn blocking_read_f64(&mut self) -> (f64, Option<f64>) {
+        let res1 = utils::q1_31_to_f64(self.peri.read_result());
+
+        let res2 = if !self.config.first_result {
+            Some(utils::q1_31_to_f64(self.peri.read_result()))
+        } else {
+            None
+        };
+
+        (res1, res2)
+    }
+
+    fn blocking_read_f64_to_buf(&mut self, result_buf: &mut [f64], result_index: &mut usize) {
+        let (res1, res2) = self.blocking_read_f64();
+        result_buf[*result_index] = res1;
+        *result_index += 1;
+
+        if let Some(res2) = res2 {
+            result_buf[*result_index] = res2;
+            *result_index += 1;
+        }
+    }
+
+    fn blocking_write_f64(&mut self, arg: f64) {
+        self.peri.write_argument(utils::f64_to_q1_31(arg));
     }
 }
 
@@ -172,11 +184,8 @@ impl<'d, T: Instance> Drop for Cordic<'d, T> {
 impl<'d, T: Instance> Cordic<'d, T> {
     /// Run a CORDIC calculation
     pub fn calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
-        let peri = &self.cordic;
-        let config = &self.config;
-
         assert!(
-            match config.first_result {
+            match self.config.first_result {
                 true => output.len() >= arg1s.len(),
                 false => output.len() >= 2 * arg1s.len(),
             },
@@ -185,87 +194,68 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         self.check_input_f64(arg1s, arg2s);
 
-        peri.set_result_count(if config.first_result { Count::One } else { Count::Two });
-        peri.set_data_width(Width::Bits32, Width::Bits32);
+        self.peri.set_result_count(if self.config.first_result {
+            Count::One
+        } else {
+            Count::Two
+        });
 
-        let state = &mut self.state;
+        self.peri.set_data_width(Width::Bits32, Width::Bits32);
 
         let mut output_count = 0;
 
         let mut consumed_input_len = 0;
 
-        match config.mode {
+        match self.config.mode {
             Mode::ZeroOverhead => {
                 // put double input into cordic
                 if arg2s.is_some() && !arg2s.unwrap().is_empty() {
                     let arg2s = arg2s.unwrap();
 
-                    peri.set_argument_count(Count::Two);
+                    self.peri.set_argument_count(Count::Two);
 
-                    let double_value = arg1s.iter().zip(arg2s);
-                    consumed_input_len = double_value.len();
+                    // Skip 1st value from arg1s, this value will be manually "preload" to cordic, to make use of cordic preload function.
+                    // And we preserve last value from arg2s, since it need to manually write to cordic, and read the result out.
+                    let double_input = arg1s.iter().skip(1).zip(&arg2s[..arg2s.len() - 1]);
+                    // Since we preload 1st value from arg1s, the consumed input length is double_input length + 1.
+                    consumed_input_len = double_input.len() + 1;
 
-                    for (arg1, arg2) in double_value {
-                        // if input_buf is full, send values to cordic
-                        if state.buf_index == INPUT_BUF_LEN - 1 {
-                            for arg in state.input_buf.chunks(2) {
-                                peri.write_argument(arg[0]);
-                                peri.write_argument(arg[1]);
+                    // preload first value from arg1 to cordic
+                    self.blocking_write_f64(arg1s[0]);
 
-                                output[output_count] = utils::q1_31_to_f64(peri.read_result());
-                                output_count += 1;
+                    for (&arg1, &arg2) in double_input {
+                        // Since we manually preload a value before,
+                        // we will write arg2 (from the actual last pair) first, (at this moment, cordic start to calculating,)
+                        // and write arg1 (from the actual next pair), then read the result, to "keep preloading"
 
-                                if !config.first_result {
-                                    output[output_count] = utils::q1_31_to_f64(peri.read_result());
-                                    output_count += 1;
-                                }
-                            }
-
-                            state.buf_index = 0;
-                        }
-
-                        for &&arg in [arg1, arg2].iter() {
-                            state.input_buf[state.buf_index] = utils::f64_to_q1_31(arg);
-                            state.buf_index += 1;
-                        }
+                        self.blocking_write_f64(arg2);
+                        self.blocking_write_f64(arg1);
+                        self.blocking_read_f64_to_buf(output, &mut output_count);
                     }
 
-                    // put left paired args into cordic
-                    if state.buf_index > 0 {
-                        for arg in state.input_buf[..state.buf_index].chunks(2) {
-                            peri.write_argument(arg[0]);
-                            peri.write_argument(arg[1]);
-
-                            output[output_count] = utils::q1_31_to_f64(peri.read_result());
-                            output_count += 1;
-
-                            if !config.first_result {
-                                output[output_count] = utils::q1_31_to_f64(peri.read_result());
-                                output_count += 1;
-                            }
-                        }
-
-                        state.buf_index = 0;
-                    }
+                    // write last input value from arg2s, then read out the result
+                    self.blocking_write_f64(arg2s[arg2s.len() - 1]);
+                    self.blocking_read_f64_to_buf(output, &mut output_count);
                 }
 
                 // put single input into cordic
                 let input_left = &arg1s[consumed_input_len..];
 
                 if !input_left.is_empty() {
-                    peri.set_argument_count(Count::One);
+                    self.peri.set_argument_count(Count::One);
 
-                    for &arg in input_left.iter() {
-                        peri.write_argument(utils::f64_to_q1_31(arg));
+                    // "preload" value to cordic (at this moment, cordic start to calculating)
+                    self.blocking_write_f64(input_left[0]);
 
-                        output[output_count] = utils::q1_31_to_f64(peri.read_result());
-                        output_count += 1;
-
-                        if !config.first_result {
-                            output[output_count] = utils::q1_31_to_f64(peri.read_result());
-                            output_count += 1;
-                        }
+                    for &arg in input_left.iter().skip(1) {
+                        // this line write arg for next round caculation to cordic,
+                        // and read result from last round
+                        self.blocking_write_f64(arg);
+                        self.blocking_read_f64_to_buf(output, &mut output_count);
                     }
+
+                    // read the last output
+                    self.blocking_read_f64_to_buf(output, &mut output_count);
                 }
 
                 output_count

From c9f759bb21782eb0487c96a59500310d1283694c Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Sat, 16 Mar 2024 21:20:17 +0800
Subject: [PATCH 05/17] stm32 CORDIC: ZeroOverhead for q1.31 and q1.15

---
 embassy-stm32/src/cordic/enums.rs |  13 -
 embassy-stm32/src/cordic/mod.rs   | 468 ++++++++++++++++++------------
 embassy-stm32/src/cordic/utils.rs |   4 +-
 3 files changed, 278 insertions(+), 207 deletions(-)

diff --git a/embassy-stm32/src/cordic/enums.rs b/embassy-stm32/src/cordic/enums.rs
index 3e1c47f7f..37c73f549 100644
--- a/embassy-stm32/src/cordic/enums.rs
+++ b/embassy-stm32/src/cordic/enums.rs
@@ -68,16 +68,3 @@ pub enum Width {
     Bits32,
     Bits16,
 }
-
-/// Cordic driver running mode
-#[derive(Clone, Copy)]
-pub enum Mode {
-    /// After caculation start, a read to RDATA register will block AHB until the caculation finished
-    ZeroOverhead,
-
-    /// Use CORDIC interrupt to trigger a read result value
-    Interrupt,
-
-    /// Use DMA to write/read value
-    Dma,
-}
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index 997ace113..61277d7e1 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -1,8 +1,9 @@
 //! CORDIC co-processor
 
-use crate::peripherals;
 use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef};
 
+use crate::peripherals;
+
 mod enums;
 pub use enums::*;
 
@@ -10,10 +11,6 @@ pub mod utils;
 
 pub(crate) mod sealed;
 
-// length of pre-allocated [u32] memory for CORDIC input,
-// length should be multiple of 2
-const INPUT_BUF_LEN: usize = 8;
-
 /// Low-level CORDIC access.
 #[cfg(feature = "unstable-pac")]
 pub mod low_level {
@@ -31,30 +28,16 @@ pub trait Instance: sealed::Instance + Peripheral<P = Self> + crate::rcc::RccPer
 
 /// CORDIC configuration
 pub struct Config {
-    mode: Mode,
     function: Function,
     precision: Precision,
     scale: Scale,
     first_result: bool,
 }
 
-// CORDIC running state
-struct State {
-    input_buf: [u32; INPUT_BUF_LEN],
-    buf_index: usize,
-}
-
 impl Config {
     /// Create a config for Cordic driver
-    pub fn new(
-        mode: Mode,
-        function: Function,
-        precision: Option<Precision>,
-        scale: Option<Scale>,
-        first_result: bool,
-    ) -> Self {
+    pub fn new(function: Function, precision: Option<Precision>, scale: Option<Scale>, first_result: bool) -> Self {
         Self {
-            mode,
             function,
             precision: precision.unwrap_or_default(),
             scale: scale.unwrap_or_default(),
@@ -133,22 +116,123 @@ impl<'d, T: Instance> Cordic<'d, T> {
         } else {
             self.peri.set_result_count(Count::Two)
         }
+    }
 
-        match self.config.mode {
-            Mode::ZeroOverhead => (),
-            Mode::Interrupt => {
-                self.peri.enable_irq();
-            }
-            Mode::Dma => {
-                self.peri.enable_write_dma();
-                self.peri.enable_read_dma();
-            }
+    fn blocking_read_f32(&mut self) -> (f32, Option<f32>) {
+        let reg_value = self.peri.read_result();
+
+        let res1 = utils::q1_15_to_f32((reg_value & ((1u32 << 16) - 1)) as u16);
+
+        // We don't care about whether the function return 1 or 2 results,
+        // the only thing matter is whether user want 1 or 2 results.
+        let res2 = if !self.config.first_result {
+            Some(utils::q1_15_to_f32((reg_value >> 16) as u16))
+        } else {
+            None
+        };
+
+        (res1, res2)
+    }
+}
+
+impl<'d, T: Instance> Drop for Cordic<'d, T> {
+    fn drop(&mut self) {
+        T::disable();
+    }
+}
+
+// q1.31 related
+impl<'d, T: Instance> Cordic<'d, T> {
+    /// Run a CORDIC calculation
+    pub fn blocking_calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
+        if arg1s.is_empty() {
+            return 0;
         }
+
+        assert!(
+            match self.config.first_result {
+                true => output.len() >= arg1s.len(),
+                false => output.len() >= 2 * arg1s.len(),
+            },
+            "Output buf length is not long enough"
+        );
+
+        self.check_input_f64(arg1s, arg2s);
+
+        self.peri.disable_irq();
+        self.peri.disable_write_dma();
+        self.peri.disable_read_dma();
+
+        self.peri.set_result_count(if self.config.first_result {
+            Count::One
+        } else {
+            Count::Two
+        });
+
+        self.peri.set_data_width(Width::Bits32, Width::Bits32);
+
+        let mut output_count = 0;
+
+        let mut consumed_input_len = 0;
+
+        // put double input into cordic
+        if arg2s.is_some() && !arg2s.expect("It's infailable").is_empty() {
+            let arg2s = arg2s.expect("It's infailable");
+
+            self.peri.set_argument_count(Count::Two);
+
+            // Skip 1st value from arg1s, this value will be manually "preload" to cordic, to make use of cordic preload function.
+            // And we preserve last value from arg2s, since it need to manually write to cordic, and read the result out.
+            let double_input = arg1s.iter().skip(1).zip(&arg2s[..arg2s.len() - 1]);
+            // Since we preload 1st value from arg1s, the consumed input length is double_input length + 1.
+            consumed_input_len = double_input.len() + 1;
+
+            // preload first value from arg1 to cordic
+            self.blocking_write_f64(arg1s[0]);
+
+            for (&arg1, &arg2) in double_input {
+                // Since we manually preload a value before,
+                // we will write arg2 (from the actual last pair) first, (at this moment, cordic start to calculating,)
+                // and write arg1 (from the actual next pair), then read the result, to "keep preloading"
+
+                self.blocking_write_f64(arg2);
+                self.blocking_write_f64(arg1);
+                self.blocking_read_f64_to_buf(output, &mut output_count);
+            }
+
+            // write last input value from arg2s, then read out the result
+            self.blocking_write_f64(arg2s[arg2s.len() - 1]);
+            self.blocking_read_f64_to_buf(output, &mut output_count);
+        }
+
+        // put single input into cordic
+        let input_left = &arg1s[consumed_input_len..];
+
+        if !input_left.is_empty() {
+            self.peri.set_argument_count(Count::One);
+
+            // "preload" value to cordic (at this moment, cordic start to calculating)
+            self.blocking_write_f64(input_left[0]);
+
+            for &arg in input_left.iter().skip(1) {
+                // this line write arg for next round caculation to cordic,
+                // and read result from last round
+                self.blocking_write_f64(arg);
+                self.blocking_read_f64_to_buf(output, &mut output_count);
+            }
+
+            // read the last output
+            self.blocking_read_f64_to_buf(output, &mut output_count);
+        }
+
+        output_count
     }
 
     fn blocking_read_f64(&mut self) -> (f64, Option<f64>) {
         let res1 = utils::q1_31_to_f64(self.peri.read_result());
 
+        // We don't care about whether the function return 1 or 2 results,
+        // the only thing matter is whether user want 1 or 2 results.
         let res2 = if !self.config.first_result {
             Some(utils::q1_31_to_f64(self.peri.read_result()))
         } else {
@@ -174,16 +258,14 @@ impl<'d, T: Instance> Cordic<'d, T> {
     }
 }
 
-impl<'d, T: Instance> Drop for Cordic<'d, T> {
-    fn drop(&mut self) {
-        T::disable();
-    }
-}
-
-// q1.31 related
+// q1.15 related
 impl<'d, T: Instance> Cordic<'d, T> {
     /// Run a CORDIC calculation
-    pub fn calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
+    pub fn blocking_calc_16bit(&mut self, arg1s: &[f32], arg2s: Option<&[f32]>, output: &mut [f32]) -> usize {
+        if arg1s.is_empty() {
+            return 0;
+        }
+
         assert!(
             match self.config.first_result {
                 true => output.len() >= arg1s.len(),
@@ -192,180 +274,182 @@ impl<'d, T: Instance> Cordic<'d, T> {
             "Output buf length is not long enough"
         );
 
-        self.check_input_f64(arg1s, arg2s);
+        self.check_input_f32(arg1s, arg2s);
 
-        self.peri.set_result_count(if self.config.first_result {
-            Count::One
-        } else {
-            Count::Two
-        });
+        self.peri.disable_irq();
+        self.peri.disable_write_dma();
+        self.peri.disable_read_dma();
 
-        self.peri.set_data_width(Width::Bits32, Width::Bits32);
+        // In q1.15 mode, 1 write/read to access 2 arguments/results
+        self.peri.set_argument_count(Count::One);
+        self.peri.set_result_count(Count::One);
+
+        self.peri.set_data_width(Width::Bits16, Width::Bits16);
 
         let mut output_count = 0;
 
-        let mut consumed_input_len = 0;
+        // In q1.15 mode, we always fill 1 pair of 16bit value into WDATA register.
+        // If arg2s is None or empty array, we assume arg2 value always 1.0 (as reset value for ARG2).
+        // If arg2s has some value, and but not as long as arg1s,
+        // we fill the reset of arg2 values with last value from arg2s (as q1.31 version does)
 
-        match self.config.mode {
-            Mode::ZeroOverhead => {
-                // put double input into cordic
-                if arg2s.is_some() && !arg2s.unwrap().is_empty() {
-                    let arg2s = arg2s.unwrap();
+        let arg2_default_value = match arg2s {
+            Some(arg2s) if !arg2s.is_empty() => arg2s[arg2s.len() - 1],
+            _ => 1.0,
+        };
 
-                    self.peri.set_argument_count(Count::Two);
+        let mut args = arg1s.iter().zip(
+            arg2s
+                .unwrap_or(&[])
+                .iter()
+                .chain(core::iter::repeat(&arg2_default_value)),
+        );
 
-                    // Skip 1st value from arg1s, this value will be manually "preload" to cordic, to make use of cordic preload function.
-                    // And we preserve last value from arg2s, since it need to manually write to cordic, and read the result out.
-                    let double_input = arg1s.iter().skip(1).zip(&arg2s[..arg2s.len() - 1]);
-                    // Since we preload 1st value from arg1s, the consumed input length is double_input length + 1.
-                    consumed_input_len = double_input.len() + 1;
+        let (&arg1, &arg2) = args
+            .next()
+            .expect("This should be infallible, since arg1s is not empty");
 
-                    // preload first value from arg1 to cordic
-                    self.blocking_write_f64(arg1s[0]);
+        // preloading 1 pair of arguments
+        self.blocking_write_f32(arg1, arg2);
 
-                    for (&arg1, &arg2) in double_input {
-                        // Since we manually preload a value before,
-                        // we will write arg2 (from the actual last pair) first, (at this moment, cordic start to calculating,)
-                        // and write arg1 (from the actual next pair), then read the result, to "keep preloading"
-
-                        self.blocking_write_f64(arg2);
-                        self.blocking_write_f64(arg1);
-                        self.blocking_read_f64_to_buf(output, &mut output_count);
-                    }
-
-                    // write last input value from arg2s, then read out the result
-                    self.blocking_write_f64(arg2s[arg2s.len() - 1]);
-                    self.blocking_read_f64_to_buf(output, &mut output_count);
-                }
-
-                // put single input into cordic
-                let input_left = &arg1s[consumed_input_len..];
-
-                if !input_left.is_empty() {
-                    self.peri.set_argument_count(Count::One);
-
-                    // "preload" value to cordic (at this moment, cordic start to calculating)
-                    self.blocking_write_f64(input_left[0]);
-
-                    for &arg in input_left.iter().skip(1) {
-                        // this line write arg for next round caculation to cordic,
-                        // and read result from last round
-                        self.blocking_write_f64(arg);
-                        self.blocking_read_f64_to_buf(output, &mut output_count);
-                    }
-
-                    // read the last output
-                    self.blocking_read_f64_to_buf(output, &mut output_count);
-                }
-
-                output_count
-            }
-            Mode::Interrupt => todo!(),
-            Mode::Dma => todo!(),
+        for (&arg1, &arg2) in args {
+            self.blocking_write_f32(arg1, arg2);
+            self.blocking_read_f32_to_buf(output, &mut output_count);
         }
+
+        // read last pair of value from cordic
+        self.blocking_read_f32_to_buf(output, &mut output_count);
+
+        output_count
     }
 
-    fn check_input_f64(&self, arg1s: &[f64], arg2s: Option<&[f64]>) {
-        let config = &self.config;
+    fn blocking_write_f32(&mut self, arg1: f32, arg2: f32) {
+        let reg_value: u32 = utils::f32_to_q1_15(arg1) as u32 + ((utils::f32_to_q1_15(arg2) as u32) << 16);
+        self.peri.write_argument(reg_value);
+    }
 
-        use Function::*;
+    fn blocking_read_f32_to_buf(&mut self, result_buf: &mut [f32], result_index: &mut usize) {
+        let (res1, res2) = self.blocking_read_f32();
+        result_buf[*result_index] = res1;
+        *result_index += 1;
 
-        // check SCALE value
-        match config.function {
-            Cos | Sin | Phase | Modulus => assert!(Scale::A1_R1 == config.scale, "SCALE should be 0"),
-            Arctan => assert!(
-                (0..=7).contains(&(config.scale as u8)),
-                "SCALE should be: 0 <= SCALE <= 7"
-            ),
-            Cosh | Sinh | Arctanh => assert!(Scale::A1o2_R2 == config.scale, "SCALE should be 1"),
-
-            Ln => assert!(
-                (1..=4).contains(&(config.scale as u8)),
-                "SCALE should be: 1 <= SCALE <= 4"
-            ),
-            Sqrt => assert!(
-                (0..=2).contains(&(config.scale as u8)),
-                "SCALE should be: 0 <= SCALE <= 2"
-            ),
-        }
-
-        // check ARG1 value
-        match config.function {
-            Cos | Sin | Phase | Modulus | Arctan => {
-                assert!(
-                    arg1s.iter().all(|v| (-1.0..=1.0).contains(v)),
-                    "ARG1 should be: -1 <= ARG1 <= 1"
-                );
-            }
-
-            Cosh | Sinh => assert!(
-                arg1s.iter().all(|v| (-0.559..=0.559).contains(v)),
-                "ARG1 should be: -0.559 <= ARG1 <= 0.559"
-            ),
-
-            Arctanh => assert!(
-                arg1s.iter().all(|v| (-0.403..=0.403).contains(v)),
-                "ARG1 should be: -0.403 <= ARG1 <= 0.403"
-            ),
-
-            Ln => {
-                match config.scale {
-                    Scale::A1o2_R2 => assert!(
-                        arg1s.iter().all(|v| (0.05354..0.5).contains(v)),
-                        "When SCALE set to 1, ARG1 should be: 0.05354 <= ARG1 < 0.5"
-                    ),
-                    Scale::A1o4_R4 => assert!(
-                        arg1s.iter().all(|v| (0.25..0.75).contains(v)),
-                        "When SCALE set to 2, ARG1 should be: 0.25 <= ARG1 < 0.75"
-                    ),
-                    Scale::A1o8_R8 => assert!(
-                        arg1s.iter().all(|v| (0.375..0.875).contains(v)),
-                        "When SCALE set to 3, ARG1 should be: 0.375 <= ARG1 < 0.875"
-                    ),
-                    Scale::A1o16_R16 => assert!(
-                        arg1s.iter().all(|v| (0.4375f64..0.584f64).contains(v)),
-                        "When SCALE set to 4, ARG1 should be: 0.4375 <= ARG1 < 0.584"
-                    ),
-                    _ => unreachable!(),
-                };
-            }
-
-            Function::Sqrt => match config.scale {
-                Scale::A1_R1 => assert!(
-                    arg1s.iter().all(|v| (0.027..0.75).contains(v)),
-                    "When SCALE set to 0, ARG1 should be: 0.027 <= ARG1 < 0.75"
-                ),
-                Scale::A1o2_R2 => assert!(
-                    arg1s.iter().all(|v| (0.375..0.875).contains(v)),
-                    "When SCALE set to 1, ARG1 should be: 0.375 <= ARG1 < 0.875"
-                ),
-                Scale::A1o4_R4 => assert!(
-                    arg1s.iter().all(|v| (0.4375..0.585).contains(v)),
-                    "When SCALE set to 2, ARG1 should be: 0.4375  <= ARG1 < 0.585"
-                ),
-                _ => unreachable!(),
-            },
-        }
-
-        // check ARG2 value
-        if let Some(arg2s) = arg2s {
-            match config.function {
-                Cos | Sin => assert!(
-                    arg2s.iter().all(|v| (0.0..=1.0).contains(v)),
-                    "ARG2 should be: 0 <= ARG2 <= 1"
-                ),
-
-                Phase | Modulus => assert!(
-                    arg2s.iter().all(|v| (-1.0..=1.0).contains(v)),
-                    "ARG2 should be: -1 <= ARG2 <= 1"
-                ),
-
-                _ => (),
-            }
+        if let Some(res2) = res2 {
+            result_buf[*result_index] = res2;
+            *result_index += 1;
         }
     }
 }
 
+// check input value ARG1, ARG2, SCALE and FUNCTION are compatible with each other
+macro_rules! check_input_value {
+    ($func_name:ident, $float_type:ty) => {
+        impl<'d, T: Instance> Cordic<'d, T> {
+            fn $func_name(&self, arg1s: &[$float_type], arg2s: Option<&[$float_type]>) {
+                let config = &self.config;
+
+                use Function::*;
+
+                // check SCALE value
+                match config.function {
+                    Cos | Sin | Phase | Modulus => assert!(Scale::A1_R1 == config.scale, "SCALE should be 0"),
+                    Arctan => assert!(
+                        (0..=7).contains(&(config.scale as u8)),
+                        "SCALE should be: 0 <= SCALE <= 7"
+                    ),
+                    Cosh | Sinh | Arctanh => assert!(Scale::A1o2_R2 == config.scale, "SCALE should be 1"),
+
+                    Ln => assert!(
+                        (1..=4).contains(&(config.scale as u8)),
+                        "SCALE should be: 1 <= SCALE <= 4"
+                    ),
+                    Sqrt => assert!(
+                        (0..=2).contains(&(config.scale as u8)),
+                        "SCALE should be: 0 <= SCALE <= 2"
+                    ),
+                }
+
+                // check ARG1 value
+                match config.function {
+                    Cos | Sin | Phase | Modulus | Arctan => {
+                        assert!(
+                            arg1s.iter().all(|v| (-1.0..=1.0).contains(v)),
+                            "ARG1 should be: -1 <= ARG1 <= 1"
+                        );
+                    }
+
+                    Cosh | Sinh => assert!(
+                        arg1s.iter().all(|v| (-0.559..=0.559).contains(v)),
+                        "ARG1 should be: -0.559 <= ARG1 <= 0.559"
+                    ),
+
+                    Arctanh => assert!(
+                        arg1s.iter().all(|v| (-0.403..=0.403).contains(v)),
+                        "ARG1 should be: -0.403 <= ARG1 <= 0.403"
+                    ),
+
+                    Ln => {
+                        match config.scale {
+                            Scale::A1o2_R2 => assert!(
+                                arg1s.iter().all(|v| (0.05354..0.5).contains(v)),
+                                "When SCALE set to 1, ARG1 should be: 0.05354 <= ARG1 < 0.5"
+                            ),
+                            Scale::A1o4_R4 => assert!(
+                                arg1s.iter().all(|v| (0.25..0.75).contains(v)),
+                                "When SCALE set to 2, ARG1 should be: 0.25 <= ARG1 < 0.75"
+                            ),
+                            Scale::A1o8_R8 => assert!(
+                                arg1s.iter().all(|v| (0.375..0.875).contains(v)),
+                                "When SCALE set to 3, ARG1 should be: 0.375 <= ARG1 < 0.875"
+                            ),
+                            Scale::A1o16_R16 => assert!(
+                                arg1s.iter().all(|v| (0.4375..0.584).contains(v)),
+                                "When SCALE set to 4, ARG1 should be: 0.4375 <= ARG1 < 0.584"
+                            ),
+                            _ => unreachable!(),
+                        };
+                    }
+
+                    Function::Sqrt => match config.scale {
+                        Scale::A1_R1 => assert!(
+                            arg1s.iter().all(|v| (0.027..0.75).contains(v)),
+                            "When SCALE set to 0, ARG1 should be: 0.027 <= ARG1 < 0.75"
+                        ),
+                        Scale::A1o2_R2 => assert!(
+                            arg1s.iter().all(|v| (0.375..0.875).contains(v)),
+                            "When SCALE set to 1, ARG1 should be: 0.375 <= ARG1 < 0.875"
+                        ),
+                        Scale::A1o4_R4 => assert!(
+                            arg1s.iter().all(|v| (0.4375..0.585).contains(v)),
+                            "When SCALE set to 2, ARG1 should be: 0.4375  <= ARG1 < 0.585"
+                        ),
+                        _ => unreachable!(),
+                    },
+                }
+
+                // check ARG2 value
+                if let Some(arg2s) = arg2s {
+                    match config.function {
+                        Cos | Sin => assert!(
+                            arg2s.iter().all(|v| (0.0..=1.0).contains(v)),
+                            "ARG2 should be: 0 <= ARG2 <= 1"
+                        ),
+
+                        Phase | Modulus => assert!(
+                            arg2s.iter().all(|v| (-1.0..=1.0).contains(v)),
+                            "ARG2 should be: -1 <= ARG2 <= 1"
+                        ),
+
+                        _ => (),
+                    }
+                }
+            }
+        }
+    };
+}
+
+check_input_value!(check_input_f64, f64);
+check_input_value!(check_input_f32, f32);
+
 foreach_interrupt!(
     ($inst:ident, cordic, $block:ident, GLOBAL, $irq:ident) => {
         impl Instance for peripherals::$inst {
diff --git a/embassy-stm32/src/cordic/utils.rs b/embassy-stm32/src/cordic/utils.rs
index 3f055c34b..2f4b5c5e8 100644
--- a/embassy-stm32/src/cordic/utils.rs
+++ b/embassy-stm32/src/cordic/utils.rs
@@ -3,7 +3,7 @@
 macro_rules! floating_fixed_convert {
     ($f_to_q:ident, $q_to_f:ident, $unsigned_bin_typ:ty, $signed_bin_typ:ty, $float_ty:ty, $offset:literal, $min_positive:literal) => {
         /// convert float point to fixed point format
-        pub fn $f_to_q(value: $float_ty) -> $unsigned_bin_typ {
+        pub(crate) fn $f_to_q(value: $float_ty) -> $unsigned_bin_typ {
             const MIN_POSITIVE: $float_ty = unsafe { core::mem::transmute($min_positive) };
 
             assert!(
@@ -31,7 +31,7 @@ macro_rules! floating_fixed_convert {
 
         #[inline(always)]
         /// convert fixed point to float point format
-        pub fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
+        pub(crate) fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
             // It's needed to convert from unsigned to signed first, for correct result.
             -(value as $signed_bin_typ as $float_ty) / ((1 as $unsigned_bin_typ << $offset) as $float_ty)
         }

From 2fa04d93ed93bed97c7575019aea32c2543e322c Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Mon, 18 Mar 2024 23:09:18 +0800
Subject: [PATCH 06/17] stm32 CORDIC: DMA for q1.31

---
 embassy-stm32/build.rs          |   4 +-
 embassy-stm32/src/cordic/mod.rs | 207 ++++++++++++++++++++++++++++++--
 2 files changed, 199 insertions(+), 12 deletions(-)

diff --git a/embassy-stm32/build.rs b/embassy-stm32/build.rs
index 15bb8ea62..e224cc5a2 100644
--- a/embassy-stm32/build.rs
+++ b/embassy-stm32/build.rs
@@ -484,7 +484,7 @@ fn main() {
                 let expr = if let Some(mux) = self.chained_muxes.get(&v.name) {
                     self.gen_mux(mux)
                 } else {
-                    self.gen_clock(&v.name)
+                    self.gen_clock(v.name)
                 };
                 match_arms.extend(quote! {
                     crate::pac::rcc::vals::#enum_name::#variant_name => #expr,
@@ -1139,6 +1139,8 @@ fn main() {
         (("timer", "CH2"), quote!(crate::timer::Ch2Dma)),
         (("timer", "CH3"), quote!(crate::timer::Ch3Dma)),
         (("timer", "CH4"), quote!(crate::timer::Ch4Dma)),
+        (("cordic", "WRITE"), quote!(crate::cordic::WriteDma)),
+        (("cordic", "READ"), quote!(crate::cordic::ReadDma)),
     ]
     .into();
 
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index 61277d7e1..9875d73bb 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -2,7 +2,7 @@
 
 use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef};
 
-use crate::peripherals;
+use crate::{dma, peripherals};
 
 mod enums;
 pub use enums::*;
@@ -17,6 +17,8 @@ pub mod low_level {
     pub use super::sealed::*;
 }
 
+const INPUT_BUF_MAX_LEN: usize = 16;
+
 /// CORDIC driver
 pub struct Cordic<'d, T: Instance> {
     peri: PeripheralRef<'d, T>,
@@ -98,7 +100,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
             warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST");
         };
 
-        self.peri.disable_irq();
         self.peri.disable_write_dma();
         self.peri.disable_read_dma();
 
@@ -111,11 +112,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
         self.peri.set_precision(self.config.precision);
         self.peri.set_scale(self.config.scale);
 
-        if self.config.first_result {
-            self.peri.set_result_count(Count::One)
-        } else {
-            self.peri.set_result_count(Count::Two)
-        }
+        // we don't set NRES in here, but to make sure NRES is set each time user call "calc"-ish functions,
+        // since each "calc"-ish functions can have different ARGSIZE and RESSIZE, thus NRES should be change accrodingly.
     }
 
     fn blocking_read_f32(&mut self) -> (f32, Option<f32>) {
@@ -143,7 +141,7 @@ impl<'d, T: Instance> Drop for Cordic<'d, T> {
 
 // q1.31 related
 impl<'d, T: Instance> Cordic<'d, T> {
-    /// Run a CORDIC calculation
+    /// Run a blocking CORDIC calculation
     pub fn blocking_calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
         if arg1s.is_empty() {
             return 0;
@@ -159,7 +157,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         self.check_input_f64(arg1s, arg2s);
 
-        self.peri.disable_irq();
         self.peri.disable_write_dma();
         self.peri.disable_read_dma();
 
@@ -256,6 +253,192 @@ impl<'d, T: Instance> Cordic<'d, T> {
     fn blocking_write_f64(&mut self, arg: f64) {
         self.peri.write_argument(utils::f64_to_q1_31(arg));
     }
+
+    /// Run a async CORDIC calculation
+    pub async fn async_calc_32bit(
+        &mut self,
+        write_dma: impl Peripheral<P = impl WriteDma<T>>,
+        read_dma: impl Peripheral<P = impl ReadDma<T>>,
+        arg1s: &[f64],
+        arg2s: Option<&[f64]>,
+        output: &mut [f64],
+    ) -> usize {
+        if arg1s.is_empty() {
+            return 0;
+        }
+
+        assert!(
+            match self.config.first_result {
+                true => output.len() >= arg1s.len(),
+                false => output.len() >= 2 * arg1s.len(),
+            },
+            "Output buf length is not long enough"
+        );
+
+        self.check_input_f64(arg1s, arg2s);
+
+        into_ref!(write_dma, read_dma);
+
+        self.peri.set_result_count(if self.config.first_result {
+            Count::One
+        } else {
+            Count::Two
+        });
+
+        self.peri.set_data_width(Width::Bits32, Width::Bits32);
+
+        let mut output_count = 0;
+        let mut consumed_input_len = 0;
+        let mut input_buf = [0u32; INPUT_BUF_MAX_LEN];
+        let mut input_buf_len = 0;
+
+        self.peri.enable_write_dma();
+        self.peri.enable_read_dma();
+
+        if !arg2s.unwrap_or_default().is_empty() {
+            let arg2s = arg2s.expect("It's infailable");
+
+            self.peri.set_argument_count(Count::Two);
+
+            let double_input = arg1s.iter().zip(arg2s);
+
+            consumed_input_len = double_input.len();
+
+            for (&arg1, &arg2) in double_input {
+                for &arg in [arg1, arg2].iter() {
+                    input_buf[input_buf_len] = utils::f64_to_q1_31(arg);
+                    input_buf_len += 1;
+                }
+
+                if input_buf_len == INPUT_BUF_MAX_LEN {
+                    self.dma_calc_32bit(
+                        &mut write_dma,
+                        &mut read_dma,
+                        true,
+                        &input_buf[..input_buf_len],
+                        output,
+                        &mut output_count,
+                    )
+                    .await;
+
+                    input_buf_len = 0;
+                }
+            }
+
+            if input_buf_len % 2 != 0 {
+                panic!("input buf len should be multiple of 2 in double mode")
+            }
+
+            if input_buf_len > 0 {
+                self.dma_calc_32bit(
+                    &mut write_dma,
+                    &mut read_dma,
+                    true,
+                    &input_buf[..input_buf_len],
+                    output,
+                    &mut output_count,
+                )
+                .await;
+
+                input_buf_len = 0;
+            }
+        }
+
+        // single input
+
+        if arg1s.len() > consumed_input_len {
+            let input_remain = &arg1s[consumed_input_len..];
+
+            self.peri.set_argument_count(Count::One);
+
+            for &arg in input_remain {
+                input_buf[input_buf_len] = utils::f64_to_q1_31(arg);
+                input_buf_len += 1;
+
+                if input_buf_len == INPUT_BUF_MAX_LEN {
+                    self.dma_calc_32bit(
+                        &mut write_dma,
+                        &mut read_dma,
+                        false,
+                        &input_buf[..input_buf_len],
+                        output,
+                        &mut output_count,
+                    )
+                    .await;
+
+                    input_buf_len = 0;
+                }
+            }
+
+            if input_buf_len > 0 {
+                self.dma_calc_32bit(
+                    &mut write_dma,
+                    &mut read_dma,
+                    false,
+                    &input_buf[..input_buf_len],
+                    output,
+                    &mut output_count,
+                )
+                .await;
+
+                // input_buf_len = 0;
+            }
+        }
+
+        output_count
+    }
+
+    async fn dma_calc_32bit(
+        &mut self,
+        write_dma: impl Peripheral<P = impl WriteDma<T>>,
+        read_dma: impl Peripheral<P = impl ReadDma<T>>,
+        double_input: bool,
+        input_buf: &[u32],
+        output: &mut [f64],
+        output_start_index: &mut usize,
+    ) {
+        into_ref!(write_dma, read_dma);
+
+        let write_req = write_dma.request();
+        let read_req = read_dma.request();
+
+        let mut output_buf = [0u32; INPUT_BUF_MAX_LEN * 2]; // make output_buf long enough
+
+        let mut output_buf_size = input_buf.len();
+        if !self.config.first_result {
+            output_buf_size *= 2;
+        };
+        if double_input {
+            output_buf_size /= 2;
+        }
+
+        let active_output_buf = &mut output_buf[..output_buf_size];
+
+        unsafe {
+            let write_transfer = dma::Transfer::new_write(
+                &mut write_dma,
+                write_req,
+                input_buf,
+                T::regs().wdata().as_ptr() as *mut _,
+                Default::default(),
+            );
+
+            let read_transfer = dma::Transfer::new_read(
+                &mut read_dma,
+                read_req,
+                T::regs().rdata().as_ptr() as *mut _,
+                active_output_buf,
+                Default::default(),
+            );
+
+            embassy_futures::join::join(write_transfer, read_transfer).await;
+        }
+
+        for &mut output_u32 in active_output_buf {
+            output[*output_start_index] = utils::q1_31_to_f64(output_u32);
+            *output_start_index += 1;
+        }
+    }
 }
 
 // q1.15 related
@@ -276,7 +459,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         self.check_input_f32(arg1s, arg2s);
 
-        self.peri.disable_irq();
         self.peri.disable_write_dma();
         self.peri.disable_read_dma();
 
@@ -409,7 +591,7 @@ macro_rules! check_input_value {
                         };
                     }
 
-                    Function::Sqrt => match config.scale {
+                    Sqrt => match config.scale {
                         Scale::A1_R1 => assert!(
                             arg1s.iter().all(|v| (0.027..0.75).contains(v)),
                             "When SCALE set to 0, ARG1 should be: 0.027 <= ARG1 < 0.75"
@@ -462,3 +644,6 @@ foreach_interrupt!(
         }
     };
 );
+
+dma_trait!(WriteDma, Instance);
+dma_trait!(ReadDma, Instance);

From 10a9cce855fbf383a8f0ea5511526777062a03c4 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Tue, 19 Mar 2024 20:09:36 +0800
Subject: [PATCH 07/17] stm32 CORDIC: DMA for q1.31 and q1.15

---
 embassy-stm32/src/cordic/mod.rs   | 264 ++++++++++++++++++++++--------
 embassy-stm32/src/cordic/utils.rs |  13 ++
 2 files changed, 209 insertions(+), 68 deletions(-)

diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index 9875d73bb..a4b98a770 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -1,5 +1,6 @@
 //! CORDIC co-processor
 
+use embassy_hal_internal::drop::OnDrop;
 use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef};
 
 use crate::{dma, peripherals};
@@ -100,9 +101,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
             warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST");
         };
 
-        self.peri.disable_write_dma();
-        self.peri.disable_read_dma();
-
         // clean RRDY flag
         while self.peri.ready_to_read() {
             self.peri.read_result();
@@ -115,22 +113,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
         // we don't set NRES in here, but to make sure NRES is set each time user call "calc"-ish functions,
         // since each "calc"-ish functions can have different ARGSIZE and RESSIZE, thus NRES should be change accrodingly.
     }
-
-    fn blocking_read_f32(&mut self) -> (f32, Option<f32>) {
-        let reg_value = self.peri.read_result();
-
-        let res1 = utils::q1_15_to_f32((reg_value & ((1u32 << 16) - 1)) as u16);
-
-        // We don't care about whether the function return 1 or 2 results,
-        // the only thing matter is whether user want 1 or 2 results.
-        let res2 = if !self.config.first_result {
-            Some(utils::q1_15_to_f32((reg_value >> 16) as u16))
-        } else {
-            None
-        };
-
-        (res1, res2)
-    }
 }
 
 impl<'d, T: Instance> Drop for Cordic<'d, T> {
@@ -141,7 +123,7 @@ impl<'d, T: Instance> Drop for Cordic<'d, T> {
 
 // q1.31 related
 impl<'d, T: Instance> Cordic<'d, T> {
-    /// Run a blocking CORDIC calculation
+    /// Run a blocking CORDIC calculation in q1.31 format
     pub fn blocking_calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
         if arg1s.is_empty() {
             return 0;
@@ -157,9 +139,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         self.check_input_f64(arg1s, arg2s);
 
-        self.peri.disable_write_dma();
-        self.peri.disable_read_dma();
-
         self.peri.set_result_count(if self.config.first_result {
             Count::One
         } else {
@@ -172,7 +151,10 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         let mut consumed_input_len = 0;
 
-        // put double input into cordic
+        //
+        // handle 2 input args calculation
+        //
+
         if arg2s.is_some() && !arg2s.expect("It's infailable").is_empty() {
             let arg2s = arg2s.expect("It's infailable");
 
@@ -202,7 +184,10 @@ impl<'d, T: Instance> Cordic<'d, T> {
             self.blocking_read_f64_to_buf(output, &mut output_count);
         }
 
-        // put single input into cordic
+        //
+        // handle 1 input arg calculation
+        //
+
         let input_left = &arg1s[consumed_input_len..];
 
         if !input_left.is_empty() {
@@ -225,27 +210,14 @@ impl<'d, T: Instance> Cordic<'d, T> {
         output_count
     }
 
-    fn blocking_read_f64(&mut self) -> (f64, Option<f64>) {
-        let res1 = utils::q1_31_to_f64(self.peri.read_result());
+    fn blocking_read_f64_to_buf(&mut self, result_buf: &mut [f64], result_index: &mut usize) {
+        result_buf[*result_index] = utils::q1_31_to_f64(self.peri.read_result());
+        *result_index += 1;
 
         // We don't care about whether the function return 1 or 2 results,
         // the only thing matter is whether user want 1 or 2 results.
-        let res2 = if !self.config.first_result {
-            Some(utils::q1_31_to_f64(self.peri.read_result()))
-        } else {
-            None
-        };
-
-        (res1, res2)
-    }
-
-    fn blocking_read_f64_to_buf(&mut self, result_buf: &mut [f64], result_index: &mut usize) {
-        let (res1, res2) = self.blocking_read_f64();
-        result_buf[*result_index] = res1;
-        *result_index += 1;
-
-        if let Some(res2) = res2 {
-            result_buf[*result_index] = res2;
+        if !self.config.first_result {
+            result_buf[*result_index] = utils::q1_31_to_f64(self.peri.read_result());
             *result_index += 1;
         }
     }
@@ -254,7 +226,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         self.peri.write_argument(utils::f64_to_q1_31(arg));
     }
 
-    /// Run a async CORDIC calculation
+    /// Run an async CORDIC calculation in q1.31 format
     pub async fn async_calc_32bit(
         &mut self,
         write_dma: impl Peripheral<P = impl WriteDma<T>>,
@@ -292,8 +264,9 @@ impl<'d, T: Instance> Cordic<'d, T> {
         let mut input_buf = [0u32; INPUT_BUF_MAX_LEN];
         let mut input_buf_len = 0;
 
-        self.peri.enable_write_dma();
-        self.peri.enable_read_dma();
+        //
+        // handle 2 input args calculation
+        //
 
         if !arg2s.unwrap_or_default().is_empty() {
             let arg2s = arg2s.expect("It's infailable");
@@ -311,7 +284,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
                 }
 
                 if input_buf_len == INPUT_BUF_MAX_LEN {
-                    self.dma_calc_32bit(
+                    self.inner_dma_calc_32bit(
                         &mut write_dma,
                         &mut read_dma,
                         true,
@@ -325,12 +298,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
                 }
             }
 
-            if input_buf_len % 2 != 0 {
-                panic!("input buf len should be multiple of 2 in double mode")
-            }
-
             if input_buf_len > 0 {
-                self.dma_calc_32bit(
+                self.inner_dma_calc_32bit(
                     &mut write_dma,
                     &mut read_dma,
                     true,
@@ -344,7 +313,9 @@ impl<'d, T: Instance> Cordic<'d, T> {
             }
         }
 
-        // single input
+        //
+        // handle 1 input arg calculation
+        //
 
         if arg1s.len() > consumed_input_len {
             let input_remain = &arg1s[consumed_input_len..];
@@ -356,7 +327,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
                 input_buf_len += 1;
 
                 if input_buf_len == INPUT_BUF_MAX_LEN {
-                    self.dma_calc_32bit(
+                    self.inner_dma_calc_32bit(
                         &mut write_dma,
                         &mut read_dma,
                         false,
@@ -371,7 +342,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             }
 
             if input_buf_len > 0 {
-                self.dma_calc_32bit(
+                self.inner_dma_calc_32bit(
                     &mut write_dma,
                     &mut read_dma,
                     false,
@@ -388,32 +359,47 @@ impl<'d, T: Instance> Cordic<'d, T> {
         output_count
     }
 
-    async fn dma_calc_32bit(
+    // this function is highly coupled with async_calc_32bit, and is not intended to use in other place
+    async fn inner_dma_calc_32bit(
         &mut self,
         write_dma: impl Peripheral<P = impl WriteDma<T>>,
         read_dma: impl Peripheral<P = impl ReadDma<T>>,
-        double_input: bool,
-        input_buf: &[u32],
-        output: &mut [f64],
-        output_start_index: &mut usize,
+        double_input: bool,             // gether extra info to calc output_buf size
+        input_buf: &[u32],              // input_buf, its content should be extact values and length for calculation
+        output: &mut [f64],             // caller uses should this as a final output array
+        output_start_index: &mut usize, // the index of start point of the output for this round of calculation
     ) {
         into_ref!(write_dma, read_dma);
 
         let write_req = write_dma.request();
         let read_req = read_dma.request();
 
-        let mut output_buf = [0u32; INPUT_BUF_MAX_LEN * 2]; // make output_buf long enough
+        // output_buf is the place to store raw value from CORDIC (via DMA).
+        // For buf size, we assume in this round of calculation:
+        // all input is 1 arg, and all calculation need 2 output,
+        // thus output_buf will always be long enough.
+        let mut output_buf = [0u32; INPUT_BUF_MAX_LEN * 2];
 
         let mut output_buf_size = input_buf.len();
         if !self.config.first_result {
+            // if we need 2 result for 1 input, then output_buf length should be 2x long.
             output_buf_size *= 2;
         };
         if double_input {
+            // if input itself is 2 args for 1 calculation, then output_buf length should be /2.
             output_buf_size /= 2;
         }
 
         let active_output_buf = &mut output_buf[..output_buf_size];
 
+        self.peri.enable_write_dma();
+        self.peri.enable_read_dma();
+
+        let on_drop = OnDrop::new(|| {
+            self.peri.disable_write_dma();
+            self.peri.disable_read_dma();
+        });
+
         unsafe {
             let write_transfer = dma::Transfer::new_write(
                 &mut write_dma,
@@ -434,6 +420,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
             embassy_futures::join::join(write_transfer, read_transfer).await;
         }
 
+        drop(on_drop);
+
         for &mut output_u32 in active_output_buf {
             output[*output_start_index] = utils::q1_31_to_f64(output_u32);
             *output_start_index += 1;
@@ -443,7 +431,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
 // q1.15 related
 impl<'d, T: Instance> Cordic<'d, T> {
-    /// Run a CORDIC calculation
+    /// Run a blocking CORDIC calculation in q1.15 format
     pub fn blocking_calc_16bit(&mut self, arg1s: &[f32], arg2s: Option<&[f32]>, output: &mut [f32]) -> usize {
         if arg1s.is_empty() {
             return 0;
@@ -459,9 +447,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         self.check_input_f32(arg1s, arg2s);
 
-        self.peri.disable_write_dma();
-        self.peri.disable_read_dma();
-
         // In q1.15 mode, 1 write/read to access 2 arguments/results
         self.peri.set_argument_count(Count::One);
         self.peri.set_result_count(Count::One);
@@ -506,20 +491,163 @@ impl<'d, T: Instance> Cordic<'d, T> {
     }
 
     fn blocking_write_f32(&mut self, arg1: f32, arg2: f32) {
-        let reg_value: u32 = utils::f32_to_q1_15(arg1) as u32 + ((utils::f32_to_q1_15(arg2) as u32) << 16);
+        let reg_value: u32 = utils::f32_args_to_u32(arg1, arg2);
         self.peri.write_argument(reg_value);
     }
 
     fn blocking_read_f32_to_buf(&mut self, result_buf: &mut [f32], result_index: &mut usize) {
-        let (res1, res2) = self.blocking_read_f32();
+        let reg_value = self.peri.read_result();
+
+        let (res1, res2) = utils::u32_to_f32_res(reg_value);
+
         result_buf[*result_index] = res1;
         *result_index += 1;
 
-        if let Some(res2) = res2 {
+        // We don't care about whether the function return 1 or 2 results,
+        // the only thing matter is whether user want 1 or 2 results.
+        if !self.config.first_result {
             result_buf[*result_index] = res2;
             *result_index += 1;
         }
     }
+
+    /// Run an async (DMA-driven) CORDIC calculation in q1.15 format.
+    ///
+    /// Returns the number of results written to `output`. If `arg2s` is `None`/empty, ARG2 is 1.0;
+    /// if it is shorter than `arg1s`, its last value is repeated. Panics if `output` is too short.
+    pub async fn async_calc_16bit(
+        &mut self,
+        write_dma: impl Peripheral<P = impl WriteDma<T>>,
+        read_dma: impl Peripheral<P = impl ReadDma<T>>,
+        arg1s: &[f32],
+        arg2s: Option<&[f32]>,
+        output: &mut [f32],
+    ) -> usize {
+        if arg1s.is_empty() {
+            return 0;
+        }
+
+        assert!(
+            match self.config.first_result {
+                true => output.len() >= arg1s.len(),
+                false => output.len() >= 2 * arg1s.len(),
+            },
+            "Output buf length is not long enough"
+        );
+
+        self.check_input_f32(arg1s, arg2s);
+
+        into_ref!(write_dma, read_dma);
+
+        // In q1.15 mode, 1 write/read to access 2 arguments/results
+        self.peri.set_argument_count(Count::One);
+        self.peri.set_result_count(Count::One);
+        self.peri.set_data_width(Width::Bits16, Width::Bits16);
+
+        let mut output_count = 0;
+        let mut input_buf = [0u32; INPUT_BUF_MAX_LEN];
+        let mut input_buf_len = 0;
+
+        // In q1.15 mode every WDATA write carries one (ARG1, ARG2) pair of 16-bit values,
+        // so an ARG2 value is always needed; a missing or short `arg2s` is padded below.
+
+        let arg2_default_value = match arg2s {
+            Some(arg2s) if !arg2s.is_empty() => arg2s[arg2s.len() - 1],
+            _ => 1.0,
+        };
+
+        let args = arg1s.iter().zip(
+            arg2s
+                .unwrap_or(&[])
+                .iter()
+                .chain(core::iter::repeat(&arg2_default_value)),
+        );
+
+        for (&arg1, &arg2) in args {
+            input_buf[input_buf_len] = utils::f32_args_to_u32(arg1, arg2);
+            input_buf_len += 1;
+            if input_buf_len == INPUT_BUF_MAX_LEN {
+                self.inner_dma_calc_16bit(&mut write_dma, &mut read_dma, &input_buf, output, &mut output_count)
+                    .await;
+                input_buf_len = 0; // batch flushed: refill from index 0 (otherwise the next write is out of bounds)
+            }
+        }
+
+        if input_buf_len > 0 {
+            self.inner_dma_calc_16bit(
+                &mut write_dma,
+                &mut read_dma,
+                &input_buf[..input_buf_len],
+                output,
+                &mut output_count,
+            )
+            .await;
+        }
+
+        output_count
+    }
+
+    // Runs one DMA round for async_calc_16bit; tightly coupled with it and not meant to be used elsewhere.
+    async fn inner_dma_calc_16bit(
+        &mut self,
+        write_dma: impl Peripheral<P = impl WriteDma<T>>,
+        read_dma: impl Peripheral<P = impl ReadDma<T>>,
+        input_buf: &[u32],  // packed argument words; must hold exactly the values (and count) for this round
+        output: &mut [f32], // the caller's final output array; this round's results are written into it
+        output_start_index: &mut usize, // index in `output` where this round starts; advanced once per result written
+    ) {
+        into_ref!(write_dma, read_dma);
+
+        let write_req = write_dma.request();
+        let read_req = read_dma.request();
+
+        // output_buf receives the raw result words read back from the CORDIC via DMA: one per input word.
+        let mut output_buf = [0u32; INPUT_BUF_MAX_LEN];
+
+        let active_output_buf = &mut output_buf[..input_buf.len()];
+
+        self.peri.enable_write_dma();
+        self.peri.enable_read_dma();
+
+        let on_drop = OnDrop::new(|| {
+            self.peri.disable_write_dma();
+            self.peri.disable_read_dma();
+        });
+        // NOTE(review): both buffers stay borrowed until the join below completes; confirm this meets dma::Transfer's safety contract.
+        unsafe {
+            let write_transfer = dma::Transfer::new_write(
+                &mut write_dma,
+                write_req,
+                input_buf,
+                T::regs().wdata().as_ptr() as *mut _,
+                Default::default(),
+            );
+
+            let read_transfer = dma::Transfer::new_read(
+                &mut read_dma,
+                read_req,
+                T::regs().rdata().as_ptr() as *mut _,
+                active_output_buf,
+                Default::default(),
+            );
+
+            embassy_futures::join::join(write_transfer, read_transfer).await;
+        }
+
+        drop(on_drop);
+
+        for &mut output_u32 in active_output_buf {
+            let (res1, res2) = utils::u32_to_f32_res(output_u32);
+
+            output[*output_start_index] = res1;
+            *output_start_index += 1;
+
+            if !self.config.first_result {
+                output[*output_start_index] = res2;
+                *output_start_index += 1;
+            }
+        }
+    }
 }
 
 // check input value ARG1, ARG2, SCALE and FUNCTION are compatible with each other
diff --git a/embassy-stm32/src/cordic/utils.rs b/embassy-stm32/src/cordic/utils.rs
index 2f4b5c5e8..79bef6b97 100644
--- a/embassy-stm32/src/cordic/utils.rs
+++ b/embassy-stm32/src/cordic/utils.rs
@@ -57,3 +57,16 @@ floating_fixed_convert!(
     15,
     0x3800_0000u32 // binary form of 1f32^(-15)
 );
+
+#[inline(always)]
+pub(crate) fn f32_args_to_u32(arg1: f32, arg2: f32) -> u32 {
+    f32_to_q1_15(arg1) as u32 + ((f32_to_q1_15(arg2) as u32) << 16) // converted arg1 in the low bits, converted arg2 shifted up by 16
+}
+
+#[inline(always)]
+pub(crate) fn u32_to_f32_res(reg_value: u32) -> (f32, f32) {
+    // The first result is taken from the low 16 bits of the word, the second from the high 16 bits.
+    let low_half = (reg_value & 0xFFFF) as u16;
+    let high_half = (reg_value >> 16) as u16;
+    (q1_15_to_f32(low_half), q1_15_to_f32(high_half))
+}

From 641da3602e1d7565d08180e0f5608f1ab81c309a Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Tue, 19 Mar 2024 22:19:06 +0800
Subject: [PATCH 08/17] stm32 CORDIC: error handle

---
 embassy-stm32/src/cordic/enums.rs  |  29 +-
 embassy-stm32/src/cordic/errors.rs |  95 ++++++
 embassy-stm32/src/cordic/mod.rs    | 510 +++++++++++++++--------------
 embassy-stm32/src/cordic/sealed.rs |  12 +-
 4 files changed, 385 insertions(+), 261 deletions(-)
 create mode 100644 embassy-stm32/src/cordic/errors.rs

diff --git a/embassy-stm32/src/cordic/enums.rs b/embassy-stm32/src/cordic/enums.rs
index 37c73f549..4b92a6cf8 100644
--- a/embassy-stm32/src/cordic/enums.rs
+++ b/embassy-stm32/src/cordic/enums.rs
@@ -1,6 +1,7 @@
 /// CORDIC function
 #[allow(missing_docs)]
-#[derive(Clone, Copy)]
+#[derive(Debug, Clone, Copy)]
+#[cfg_attr(feature = "defmt", derive(defmt::Format))]
 pub enum Function {
     Cos = 0,
     Sin,
@@ -16,7 +17,7 @@ pub enum Function {
 
 /// CORDIC precision
 #[allow(missing_docs)]
-#[derive(Clone, Copy, Default)]
+#[derive(Debug, Clone, Copy, Default)]
 pub enum Precision {
     Iters4 = 1,
     Iters8,
@@ -37,25 +38,25 @@ pub enum Precision {
 }
 
 /// CORDIC scale
-#[allow(non_camel_case_types)]
 #[allow(missing_docs)]
-#[derive(Clone, Copy, Default, PartialEq)]
+#[derive(Debug, Clone, Copy, Default, PartialEq)]
+#[cfg_attr(feature = "defmt", derive(defmt::Format))]
 pub enum Scale {
     #[default]
-    A1_R1 = 0,
-    A1o2_R2,
-    A1o4_R4,
-    A1o8_R8,
-    A1o16_R16,
-    A1o32_R32,
-    A1o64_R64,
-    A1o128_R128,
+    Arg1Res1 = 0,
+    Arg1o2Res2,
+    Arg1o4Res4,
+    Arg1o8Res8,
+    Arg1o16Res16,
+    Arg1o32Res32,
+    Arg1o64Res64,
+    Arg1o128Res128,
 }
 
-/// CORDIC argument/result count
+/// CORDIC argument/result register access count
 #[allow(missing_docs)]
 #[derive(Clone, Copy, Default)]
-pub enum Count {
+pub enum AccessCount {
     #[default]
     One,
     Two,
diff --git a/embassy-stm32/src/cordic/errors.rs b/embassy-stm32/src/cordic/errors.rs
new file mode 100644
index 000000000..d0b2dc618
--- /dev/null
+++ b/embassy-stm32/src/cordic/errors.rs
@@ -0,0 +1,95 @@
+use super::{Function, Scale};
+
+/// Errors reported by the [Cordic](super::Cordic) driver
+#[derive(Debug)]
+pub enum CordicError {
+    /// The configured SCALE is not valid for the configured FUNCTION
+    ConfigError(ConfigError),
+    /// An input argument is outside the range accepted for the FUNCTION (and SCALE) in use
+    ArgError(ArgError),
+    /// The output buffer is not long enough to hold the results
+    OutputLengthNotEnough,
+}
+
+#[cfg(feature = "defmt")]
+impl defmt::Format for CordicError {
+    fn format(&self, fmt: defmt::Formatter) {
+        use CordicError::*;
+        // Wrapped errors are printed through their own Format impl; only the bare variant has its own text.
+        match self {
+            ConfigError(e) => defmt::write!(fmt, "{}", e),
+            ArgError(e) => defmt::write!(fmt, "{}", e),
+            OutputLengthNotEnough => defmt::write!(fmt, "Output buffer length is not long enough"),
+        }
+    }
+}
+
+/// Error during parsing [Cordic::Config](super::Config)
+#[derive(Debug)]
+pub struct ConfigError {
+    pub(super) func: Function, // the FUNCTION the rejected config was built for
+    pub(super) scale_range: [u8; 2], // raw SCALE values accepted for `func`, as [min, max], both inclusive
+}
+
+#[cfg(feature = "defmt")]
+impl defmt::Format for ConfigError {
+    fn format(&self, fmt: defmt::Formatter) {
+        defmt::write!(fmt, "For FUNCTION: {},", self.func);
+        // When both ends of the range are equal, exactly one SCALE value is accepted, so print it alone.
+        if self.scale_range[0] == self.scale_range[1] {
+            defmt::write!(fmt, " SCALE value should be {}", self.scale_range[0])
+        } else {
+            defmt::write!(
+                fmt,
+                " SCALE value should be {} <= SCALE <= {}",
+                self.scale_range[0],
+                self.scale_range[1]
+            )
+        }
+    }
+}
+
+/// Error on checking input arguments: an argument is outside its accepted range
+#[derive(Debug)]
+pub struct ArgError {
+    pub(super) func: Function, // the FUNCTION the argument was checked against
+    pub(super) scale: Option<Scale>, // SCALE the range applies to; left out of the message when `None`
+    pub(super) arg_range: [f32; 2], // [lower, upper] bound; only for debug display, f32 is ok
+    pub(super) inclusive_upper_bound: bool, // true: the message uses "<=" for the upper bound, false: "<"
+    pub(super) arg_type: ArgType, // which argument (ARG1 or ARG2) the message names
+}
+
+#[cfg(feature = "defmt")]
+impl defmt::Format for ArgError {
+    fn format(&self, fmt: defmt::Formatter) {
+        defmt::write!(fmt, "For FUNCTION: {},", self.func);
+
+        if let Some(scale) = self.scale {
+            defmt::write!(fmt, " when SCALE is {},", scale);
+        }
+
+        let arg_string = match self.arg_type {
+            ArgType::Arg1 => "ARG1",
+            ArgType::Arg2 => "ARG2",
+        };
+
+        defmt::write!(fmt, " {} should be", arg_string);
+        // The lower bound is always inclusive; "=" is appended to "<" only for an inclusive upper bound.
+        let inclusive_string = if self.inclusive_upper_bound { "=" } else { "" };
+
+        defmt::write!(
+            fmt,
+            " {} <= {} <{} {}",
+            self.arg_range[0],
+            arg_string,
+            inclusive_string,
+            self.arg_range[1]
+        )
+    }
+}
+
+#[derive(Debug)] // selects which argument an ArgError refers to
+pub(super) enum ArgType {
+    Arg1, // printed as "ARG1"
+    Arg2, // printed as "ARG2"
+}
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index a4b98a770..5ac9addd8 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -1,4 +1,4 @@
-//! CORDIC co-processor
+//! coordinate rotation digital computer (CORDIC)
 
 use embassy_hal_internal::drop::OnDrop;
 use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef};
@@ -8,6 +8,9 @@ use crate::{dma, peripherals};
 mod enums;
 pub use enums::*;
 
+mod errors;
+pub use errors::*;
+
 pub mod utils;
 
 pub(crate) mod sealed;
@@ -30,33 +33,55 @@ pub struct Cordic<'d, T: Instance> {
 pub trait Instance: sealed::Instance + Peripheral<P = Self> + crate::rcc::RccPeripheral {}
 
 /// CORDIC configuration
+#[derive(Debug)]
 pub struct Config {
     function: Function,
     precision: Precision,
     scale: Scale,
-    first_result: bool,
+    res1_only: bool,
 }
 
 impl Config {
     /// Create a config for Cordic driver
-    pub fn new(function: Function, precision: Option<Precision>, scale: Option<Scale>, first_result: bool) -> Self {
-        Self {
+    pub fn new(function: Function, precision: Precision, scale: Scale, res1_only: bool) -> Result<Self, CordicError> {
+        let config = Self {
             function,
-            precision: precision.unwrap_or_default(),
-            scale: scale.unwrap_or_default(),
-            first_result,
-        }
+            precision,
+            scale,
+            res1_only,
+        };
+
+        config.check_scale()?;
+
+        Ok(config)
     }
 
-    fn check_scale(&self) -> bool {
+    fn check_scale(&self) -> Result<(), CordicError> {
+        use Function::*;
+
         let scale_raw = self.scale as u8;
 
-        match self.function {
-            Function::Cos | Function::Sin | Function::Phase | Function::Modulus => 0 == scale_raw,
-            Function::Arctan => (0..=7).contains(&scale_raw),
-            Function::Cosh | Function::Sinh | Function::Arctanh => 1 == scale_raw,
-            Function::Ln => (1..=4).contains(&scale_raw),
-            Function::Sqrt => (0..=2).contains(&scale_raw),
+        let err_range = match self.function {
+            Cos | Sin | Phase | Modulus if !(0..=0).contains(&scale_raw) => Some([0, 0]),
+
+            Arctan if !(0..=7).contains(&scale_raw) => Some([0, 7]),
+
+            Cosh | Sinh | Arctanh if !(1..=1).contains(&scale_raw) => Some([1, 1]),
+
+            Ln if !(1..=4).contains(&scale_raw) => Some([1, 4]),
+
+            Sqrt if !(0..=2).contains(&scale_raw) => Some([0, 2]),
+
+            Cos | Sin | Phase | Modulus | Arctan | Cosh | Sinh | Arctanh | Ln | Sqrt => None,
+        };
+
+        if let Some(range) = err_range {
+            Err(CordicError::ConfigError(ConfigError {
+                func: self.function,
+                scale_range: range,
+            }))
+        } else {
+            Ok(())
         }
     }
 }
@@ -73,10 +98,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         into_ref!(peri);
 
-        if !config.check_scale() {
-            panic!("Scale value is not compatible with Function")
-        }
-
         let mut instance = Self { peri, config };
 
         instance.reconfigure();
@@ -91,21 +112,12 @@ impl<'d, T: Instance> Cordic<'d, T> {
     }
 
     /// Set extra config for data count and data width.
-    pub fn extra_config(&mut self, arg_cnt: Count, arg_width: Width, res_width: Width) {
+    pub fn extra_config(&mut self, arg_cnt: AccessCount, arg_width: Width, res_width: Width) {
         self.peri.set_argument_count(arg_cnt);
         self.peri.set_data_width(arg_width, res_width);
     }
 
     fn reconfigure(&mut self) {
-        if self.peri.ready_to_read() {
-            warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST");
-        };
-
-        // clean RRDY flag
-        while self.peri.ready_to_read() {
-            self.peri.read_result();
-        }
-
         self.peri.set_func(self.config.function);
         self.peri.set_precision(self.config.precision);
         self.peri.set_scale(self.config.scale);
@@ -113,6 +125,47 @@ impl<'d, T: Instance> Cordic<'d, T> {
         // we don't set NRES in here, but to make sure NRES is set each time user call "calc"-ish functions,
         // since each "calc"-ish functions can have different ARGSIZE and RESSIZE, thus NRES should be change accrodingly.
     }
+
+    async fn launch_a_dma_transfer(
+        &mut self,
+        write_dma: impl Peripheral<P = impl WriteDma<T>>,
+        read_dma: impl Peripheral<P = impl ReadDma<T>>,
+        input: &[u32],
+        output: &mut [u32],
+    ) {
+        into_ref!(write_dma, read_dma);
+
+        let write_req = write_dma.request();
+        let read_req = read_dma.request();
+
+        self.peri.enable_write_dma();
+        self.peri.enable_read_dma();
+
+        let _on_drop = OnDrop::new(|| {
+            self.peri.disable_write_dma();
+            self.peri.disable_read_dma();
+        });
+
+        unsafe {
+            let write_transfer = dma::Transfer::new_write(
+                &mut write_dma,
+                write_req,
+                input,
+                T::regs().wdata().as_ptr() as *mut _,
+                Default::default(),
+            );
+
+            let read_transfer = dma::Transfer::new_read(
+                &mut read_dma,
+                read_req,
+                T::regs().rdata().as_ptr() as *mut _,
+                output,
+                Default::default(),
+            );
+
+            embassy_futures::join::join(write_transfer, read_transfer).await;
+        }
+    }
 }
 
 impl<'d, T: Instance> Drop for Cordic<'d, T> {
@@ -124,25 +177,31 @@ impl<'d, T: Instance> Drop for Cordic<'d, T> {
 // q1.31 related
 impl<'d, T: Instance> Cordic<'d, T> {
     /// Run a blocking CORDIC calculation in q1.31 format
-    pub fn blocking_calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize {
+    pub fn blocking_calc_32bit(
+        &mut self,
+        arg1s: &[f64],
+        arg2s: Option<&[f64]>,
+        output: &mut [f64],
+    ) -> Result<usize, CordicError> {
         if arg1s.is_empty() {
-            return 0;
+            return Ok(0);
         }
 
-        assert!(
-            match self.config.first_result {
-                true => output.len() >= arg1s.len(),
-                false => output.len() >= 2 * arg1s.len(),
-            },
-            "Output buf length is not long enough"
-        );
+        let output_length_enough = match self.config.res1_only {
+            true => output.len() >= arg1s.len(),
+            false => output.len() >= 2 * arg1s.len(),
+        };
 
-        self.check_input_f64(arg1s, arg2s);
+        if !output_length_enough {
+            return Err(CordicError::OutputLengthNotEnough);
+        }
 
-        self.peri.set_result_count(if self.config.first_result {
-            Count::One
+        self.check_input_f64(arg1s, arg2s)?;
+
+        self.peri.set_result_count(if self.config.res1_only {
+            AccessCount::One
         } else {
-            Count::Two
+            AccessCount::Two
         });
 
         self.peri.set_data_width(Width::Bits32, Width::Bits32);
@@ -155,10 +214,10 @@ impl<'d, T: Instance> Cordic<'d, T> {
         // handle 2 input args calculation
         //
 
-        if arg2s.is_some() && !arg2s.expect("It's infailable").is_empty() {
-            let arg2s = arg2s.expect("It's infailable");
+        if arg2s.is_some() && !arg2s.unwrap().is_empty() {
+            let arg2s = arg2s.unwrap();
 
-            self.peri.set_argument_count(Count::Two);
+            self.peri.set_argument_count(AccessCount::Two);
 
             // Skip 1st value from arg1s, this value will be manually "preload" to cordic, to make use of cordic preload function.
             // And we preserve last value from arg2s, since it need to manually write to cordic, and read the result out.
@@ -191,7 +250,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         let input_left = &arg1s[consumed_input_len..];
 
         if !input_left.is_empty() {
-            self.peri.set_argument_count(Count::One);
+            self.peri.set_argument_count(AccessCount::One);
 
             // "preload" value to cordic (at this moment, cordic start to calculating)
             self.blocking_write_f64(input_left[0]);
@@ -207,7 +266,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             self.blocking_read_f64_to_buf(output, &mut output_count);
         }
 
-        output_count
+        Ok(output_count)
     }
 
     fn blocking_read_f64_to_buf(&mut self, result_buf: &mut [f64], result_index: &mut usize) {
@@ -216,7 +275,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         // We don't care about whether the function return 1 or 2 results,
         // the only thing matter is whether user want 1 or 2 results.
-        if !self.config.first_result {
+        if !self.config.res1_only {
             result_buf[*result_index] = utils::q1_31_to_f64(self.peri.read_result());
             *result_index += 1;
         }
@@ -234,27 +293,28 @@ impl<'d, T: Instance> Cordic<'d, T> {
         arg1s: &[f64],
         arg2s: Option<&[f64]>,
         output: &mut [f64],
-    ) -> usize {
+    ) -> Result<usize, CordicError> {
         if arg1s.is_empty() {
-            return 0;
+            return Ok(0);
         }
 
-        assert!(
-            match self.config.first_result {
-                true => output.len() >= arg1s.len(),
-                false => output.len() >= 2 * arg1s.len(),
-            },
-            "Output buf length is not long enough"
-        );
+        let output_length_enough = match self.config.res1_only {
+            true => output.len() >= arg1s.len(),
+            false => output.len() >= 2 * arg1s.len(),
+        };
 
-        self.check_input_f64(arg1s, arg2s);
+        if !output_length_enough {
+            return Err(CordicError::OutputLengthNotEnough);
+        }
+
+        self.check_input_f64(arg1s, arg2s)?;
 
         into_ref!(write_dma, read_dma);
 
-        self.peri.set_result_count(if self.config.first_result {
-            Count::One
+        self.peri.set_result_count(if self.config.res1_only {
+            AccessCount::One
         } else {
-            Count::Two
+            AccessCount::Two
         });
 
         self.peri.set_data_width(Width::Bits32, Width::Bits32);
@@ -269,9 +329,9 @@ impl<'d, T: Instance> Cordic<'d, T> {
         //
 
         if !arg2s.unwrap_or_default().is_empty() {
-            let arg2s = arg2s.expect("It's infailable");
+            let arg2s = arg2s.unwrap();
 
-            self.peri.set_argument_count(Count::Two);
+            self.peri.set_argument_count(AccessCount::Two);
 
             let double_input = arg1s.iter().zip(arg2s);
 
@@ -320,7 +380,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         if arg1s.len() > consumed_input_len {
             let input_remain = &arg1s[consumed_input_len..];
 
-            self.peri.set_argument_count(Count::One);
+            self.peri.set_argument_count(AccessCount::One);
 
             for &arg in input_remain {
                 input_buf[input_buf_len] = utils::f64_to_q1_31(arg);
@@ -356,7 +416,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             }
         }
 
-        output_count
+        Ok(output_count)
     }
 
     // this function is highly coupled with async_calc_32bit, and is not intended to use in other place
@@ -369,11 +429,6 @@ impl<'d, T: Instance> Cordic<'d, T> {
         output: &mut [f64],             // caller uses should this as a final output array
         output_start_index: &mut usize, // the index of start point of the output for this round of calculation
     ) {
-        into_ref!(write_dma, read_dma);
-
-        let write_req = write_dma.request();
-        let read_req = read_dma.request();
-
         // output_buf is the place to store raw value from CORDIC (via DMA).
         // For buf size, we assume in this round of calculation:
         // all input is 1 arg, and all calculation need 2 output,
@@ -381,7 +436,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         let mut output_buf = [0u32; INPUT_BUF_MAX_LEN * 2];
 
         let mut output_buf_size = input_buf.len();
-        if !self.config.first_result {
+        if !self.config.res1_only {
             // if we need 2 result for 1 input, then output_buf length should be 2x long.
             output_buf_size *= 2;
         };
@@ -392,35 +447,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         let active_output_buf = &mut output_buf[..output_buf_size];
 
-        self.peri.enable_write_dma();
-        self.peri.enable_read_dma();
-
-        let on_drop = OnDrop::new(|| {
-            self.peri.disable_write_dma();
-            self.peri.disable_read_dma();
-        });
-
-        unsafe {
-            let write_transfer = dma::Transfer::new_write(
-                &mut write_dma,
-                write_req,
-                input_buf,
-                T::regs().wdata().as_ptr() as *mut _,
-                Default::default(),
-            );
-
-            let read_transfer = dma::Transfer::new_read(
-                &mut read_dma,
-                read_req,
-                T::regs().rdata().as_ptr() as *mut _,
-                active_output_buf,
-                Default::default(),
-            );
-
-            embassy_futures::join::join(write_transfer, read_transfer).await;
-        }
-
-        drop(on_drop);
+        self.launch_a_dma_transfer(write_dma, read_dma, input_buf, active_output_buf)
+            .await;
 
         for &mut output_u32 in active_output_buf {
             output[*output_start_index] = utils::q1_31_to_f64(output_u32);
@@ -432,24 +460,30 @@ impl<'d, T: Instance> Cordic<'d, T> {
 // q1.15 related
 impl<'d, T: Instance> Cordic<'d, T> {
     /// Run a blocking CORDIC calculation in q1.15 format
-    pub fn blocking_calc_16bit(&mut self, arg1s: &[f32], arg2s: Option<&[f32]>, output: &mut [f32]) -> usize {
+    pub fn blocking_calc_16bit(
+        &mut self,
+        arg1s: &[f32],
+        arg2s: Option<&[f32]>,
+        output: &mut [f32],
+    ) -> Result<usize, CordicError> {
         if arg1s.is_empty() {
-            return 0;
+            return Ok(0);
         }
 
-        assert!(
-            match self.config.first_result {
-                true => output.len() >= arg1s.len(),
-                false => output.len() >= 2 * arg1s.len(),
-            },
-            "Output buf length is not long enough"
-        );
+        let output_length_enough = match self.config.res1_only {
+            true => output.len() >= arg1s.len(),
+            false => output.len() >= 2 * arg1s.len(),
+        };
 
-        self.check_input_f32(arg1s, arg2s);
+        if !output_length_enough {
+            return Err(CordicError::OutputLengthNotEnough);
+        }
+
+        self.check_input_f32(arg1s, arg2s)?;
 
         // In q1.15 mode, 1 write/read to access 2 arguments/results
-        self.peri.set_argument_count(Count::One);
-        self.peri.set_result_count(Count::One);
+        self.peri.set_argument_count(AccessCount::One);
+        self.peri.set_result_count(AccessCount::One);
 
         self.peri.set_data_width(Width::Bits16, Width::Bits16);
 
@@ -472,9 +506,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
                 .chain(core::iter::repeat(&arg2_default_value)),
         );
 
-        let (&arg1, &arg2) = args
-            .next()
-            .expect("This should be infallible, since arg1s is not empty");
+        let (&arg1, &arg2) = args.next().unwrap();
 
         // preloading 1 pair of arguments
         self.blocking_write_f32(arg1, arg2);
@@ -487,7 +519,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         // read last pair of value from cordic
         self.blocking_read_f32_to_buf(output, &mut output_count);
 
-        output_count
+        Ok(output_count)
     }
 
     fn blocking_write_f32(&mut self, arg1: f32, arg2: f32) {
@@ -505,7 +537,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         // We don't care about whether the function return 1 or 2 results,
         // the only thing matter is whether user want 1 or 2 results.
-        if !self.config.first_result {
+        if !self.config.res1_only {
             result_buf[*result_index] = res2;
             *result_index += 1;
         }
@@ -519,26 +551,27 @@ impl<'d, T: Instance> Cordic<'d, T> {
         arg1s: &[f32],
         arg2s: Option<&[f32]>,
         output: &mut [f32],
-    ) -> usize {
+    ) -> Result<usize, CordicError> {
         if arg1s.is_empty() {
-            return 0;
+            return Ok(0);
         }
 
-        assert!(
-            match self.config.first_result {
-                true => output.len() >= arg1s.len(),
-                false => output.len() >= 2 * arg1s.len(),
-            },
-            "Output buf length is not long enough"
-        );
+        let output_length_enough = match self.config.res1_only {
+            true => output.len() >= arg1s.len(),
+            false => output.len() >= 2 * arg1s.len(),
+        };
 
-        self.check_input_f32(arg1s, arg2s);
+        if !output_length_enough {
+            return Err(CordicError::OutputLengthNotEnough);
+        }
+
+        self.check_input_f32(arg1s, arg2s)?;
 
         into_ref!(write_dma, read_dma);
 
         // In q1.15 mode, 1 write/read to access 2 arguments/results
-        self.peri.set_argument_count(Count::One);
-        self.peri.set_result_count(Count::One);
+        self.peri.set_argument_count(AccessCount::One);
+        self.peri.set_result_count(AccessCount::One);
 
         self.peri.set_data_width(Width::Bits16, Width::Bits16);
 
@@ -584,7 +617,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             .await;
         }
 
-        output_count
+        Ok(output_count)
     }
 
     // this function is highly coupled with async_calc_16bit, and is not intended to use in other place
@@ -596,45 +629,13 @@ impl<'d, T: Instance> Cordic<'d, T> {
         output: &mut [f32], // caller uses should this as a final output array
         output_start_index: &mut usize, // the index of start point of the output for this round of calculation
     ) {
-        into_ref!(write_dma, read_dma);
-
-        let write_req = write_dma.request();
-        let read_req = read_dma.request();
-
         // output_buf is the place to store raw value from CORDIC (via DMA).
         let mut output_buf = [0u32; INPUT_BUF_MAX_LEN];
 
         let active_output_buf = &mut output_buf[..input_buf.len()];
 
-        self.peri.enable_write_dma();
-        self.peri.enable_read_dma();
-
-        let on_drop = OnDrop::new(|| {
-            self.peri.disable_write_dma();
-            self.peri.disable_read_dma();
-        });
-
-        unsafe {
-            let write_transfer = dma::Transfer::new_write(
-                &mut write_dma,
-                write_req,
-                input_buf,
-                T::regs().wdata().as_ptr() as *mut _,
-                Default::default(),
-            );
-
-            let read_transfer = dma::Transfer::new_read(
-                &mut read_dma,
-                read_req,
-                T::regs().rdata().as_ptr() as *mut _,
-                active_output_buf,
-                Default::default(),
-            );
-
-            embassy_futures::join::join(write_transfer, read_transfer).await;
-        }
-
-        drop(on_drop);
+        self.launch_a_dma_transfer(write_dma, read_dma, input_buf, active_output_buf)
+            .await;
 
         for &mut output_u32 in active_output_buf {
             let (res1, res2) = utils::u32_to_f32_res(output_u32);
@@ -642,7 +643,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             output[*output_start_index] = res1;
             *output_start_index += 1;
 
-            if !self.config.first_result {
+            if !self.config.res1_only {
                 output[*output_start_index] = res2;
                 *output_start_index += 1;
             }
@@ -654,104 +655,131 @@ impl<'d, T: Instance> Cordic<'d, T> {
 macro_rules! check_input_value {
     ($func_name:ident, $float_type:ty) => {
         impl<'d, T: Instance> Cordic<'d, T> {
-            fn $func_name(&self, arg1s: &[$float_type], arg2s: Option<&[$float_type]>) {
+            fn $func_name(&self, arg1s: &[$float_type], arg2s: Option<&[$float_type]>) -> Result<(), CordicError> {
                 let config = &self.config;
 
                 use Function::*;
 
-                // check SCALE value
-                match config.function {
-                    Cos | Sin | Phase | Modulus => assert!(Scale::A1_R1 == config.scale, "SCALE should be 0"),
-                    Arctan => assert!(
-                        (0..=7).contains(&(config.scale as u8)),
-                        "SCALE should be: 0 <= SCALE <= 7"
-                    ),
-                    Cosh | Sinh | Arctanh => assert!(Scale::A1o2_R2 == config.scale, "SCALE should be 1"),
-
-                    Ln => assert!(
-                        (1..=4).contains(&(config.scale as u8)),
-                        "SCALE should be: 1 <= SCALE <= 4"
-                    ),
-                    Sqrt => assert!(
-                        (0..=2).contains(&(config.scale as u8)),
-                        "SCALE should be: 0 <= SCALE <= 2"
-                    ),
+                struct Arg1ErrInfo {
+                    scale: Option<Scale>,
+                    range: [f32; 2],
+                    inclusive_upper_bound: bool,
                 }
 
                 // check ARG1 value
-                match config.function {
-                    Cos | Sin | Phase | Modulus | Arctan => {
-                        assert!(
-                            arg1s.iter().all(|v| (-1.0..=1.0).contains(v)),
-                            "ARG1 should be: -1 <= ARG1 <= 1"
-                        );
+                let err_info = match config.function {
+                    Cos | Sin | Phase | Modulus | Arctan if arg1s.iter().any(|v| !(-1.0..=1.0).contains(v)) => {
+                        Some(Arg1ErrInfo {
+                            scale: None,
+                            range: [-1.0, 1.0],
+                            inclusive_upper_bound: true,
+                        })
                     }
 
-                    Cosh | Sinh => assert!(
-                        arg1s.iter().all(|v| (-0.559..=0.559).contains(v)),
-                        "ARG1 should be: -0.559 <= ARG1 <= 0.559"
-                    ),
+                    Cosh | Sinh if arg1s.iter().any(|v| !(-0.559..=0.559).contains(v)) => Some(Arg1ErrInfo {
+                        scale: None,
+                        range: [-0.559, 0.559],
+                        inclusive_upper_bound: true,
+                    }),
 
-                    Arctanh => assert!(
-                        arg1s.iter().all(|v| (-0.403..=0.403).contains(v)),
-                        "ARG1 should be: -0.403 <= ARG1 <= 0.403"
-                    ),
+                    Arctanh if arg1s.iter().any(|v| !(-0.403..=0.403).contains(v)) => Some(Arg1ErrInfo {
+                        scale: None,
+                        range: [-0.403, 0.403],
+                        inclusive_upper_bound: true,
+                    }),
 
-                    Ln => {
-                        match config.scale {
-                            Scale::A1o2_R2 => assert!(
-                                arg1s.iter().all(|v| (0.05354..0.5).contains(v)),
-                                "When SCALE set to 1, ARG1 should be: 0.05354 <= ARG1 < 0.5"
-                            ),
-                            Scale::A1o4_R4 => assert!(
-                                arg1s.iter().all(|v| (0.25..0.75).contains(v)),
-                                "When SCALE set to 2, ARG1 should be: 0.25 <= ARG1 < 0.75"
-                            ),
-                            Scale::A1o8_R8 => assert!(
-                                arg1s.iter().all(|v| (0.375..0.875).contains(v)),
-                                "When SCALE set to 3, ARG1 should be: 0.375 <= ARG1 < 0.875"
-                            ),
-                            Scale::A1o16_R16 => assert!(
-                                arg1s.iter().all(|v| (0.4375..0.584).contains(v)),
-                                "When SCALE set to 4, ARG1 should be: 0.4375 <= ARG1 < 0.584"
-                            ),
-                            _ => unreachable!(),
-                        };
-                    }
+                    Ln => match config.scale {
+                        Scale::Arg1o2Res2 if arg1s.iter().any(|v| !(0.0535..0.5).contains(v)) => Some(Arg1ErrInfo {
+                            scale: Some(Scale::Arg1o2Res2),
+                            range: [0.0535, 0.5],
+                            inclusive_upper_bound: false,
+                        }),
+                        Scale::Arg1o4Res4 if arg1s.iter().any(|v| !(0.25..0.75).contains(v)) => Some(Arg1ErrInfo {
+                            scale: Some(Scale::Arg1o4Res4),
+                            range: [0.25, 0.75],
+                            inclusive_upper_bound: false,
+                        }),
+                        Scale::Arg1o8Res8 if arg1s.iter().any(|v| !(0.375..0.875).contains(v)) => Some(Arg1ErrInfo {
+                            scale: Some(Scale::Arg1o8Res8),
+                            range: [0.375, 0.875],
+                            inclusive_upper_bound: false,
+                        }),
+                        Scale::Arg1o16Res16 if arg1s.iter().any(|v| !(0.4375..0.584).contains(v)) => {
+                            Some(Arg1ErrInfo {
+                                scale: Some(Scale::Arg1o16Res16),
+                                range: [0.4375, 0.584],
+                                inclusive_upper_bound: false,
+                            })
+                        }
+
+                        Scale::Arg1o2Res2 | Scale::Arg1o4Res4 | Scale::Arg1o8Res8 | Scale::Arg1o16Res16 => None,
 
-                    Sqrt => match config.scale {
-                        Scale::A1_R1 => assert!(
-                            arg1s.iter().all(|v| (0.027..0.75).contains(v)),
-                            "When SCALE set to 0, ARG1 should be: 0.027 <= ARG1 < 0.75"
-                        ),
-                        Scale::A1o2_R2 => assert!(
-                            arg1s.iter().all(|v| (0.375..0.875).contains(v)),
-                            "When SCALE set to 1, ARG1 should be: 0.375 <= ARG1 < 0.875"
-                        ),
-                        Scale::A1o4_R4 => assert!(
-                            arg1s.iter().all(|v| (0.4375..0.585).contains(v)),
-                            "When SCALE set to 2, ARG1 should be: 0.4375  <= ARG1 < 0.585"
-                        ),
                         _ => unreachable!(),
                     },
+
+                    Sqrt => match config.scale {
+                        Scale::Arg1Res1 if arg1s.iter().any(|v| !(0.027..0.75).contains(v)) => Some(Arg1ErrInfo {
+                            scale: Some(Scale::Arg1Res1),
+                            range: [0.027, 0.75],
+                            inclusive_upper_bound: false,
+                        }),
+                        Scale::Arg1o2Res2 if arg1s.iter().any(|v| !(0.375..0.875).contains(v)) => Some(Arg1ErrInfo {
+                            scale: Some(Scale::Arg1o2Res2),
+                            range: [0.375, 0.875],
+                            inclusive_upper_bound: false,
+                        }),
+                        Scale::Arg1o4Res4 if arg1s.iter().any(|v| !(0.4375..0.584).contains(v)) => Some(Arg1ErrInfo {
+                            scale: Some(Scale::Arg1o4Res4),
+                            range: [0.4375, 0.584],
+                            inclusive_upper_bound: false,
+                        }),
+                        Scale::Arg1Res1 | Scale::Arg1o2Res2 | Scale::Arg1o4Res4 => None,
+                        _ => unreachable!(),
+                    },
+
+                    Cos | Sin | Phase | Modulus | Arctan | Cosh | Sinh | Arctanh => None,
+                };
+
+                if let Some(err) = err_info {
+                    return Err(CordicError::ArgError(ArgError {
+                        func: config.function,
+                        scale: err.scale,
+                        arg_range: err.range,
+                        inclusive_upper_bound: err.inclusive_upper_bound,
+                        arg_type: ArgType::Arg1,
+                    }));
                 }
 
                 // check ARG2 value
                 if let Some(arg2s) = arg2s {
-                    match config.function {
-                        Cos | Sin => assert!(
-                            arg2s.iter().all(|v| (0.0..=1.0).contains(v)),
-                            "ARG2 should be: 0 <= ARG2 <= 1"
-                        ),
+                    struct Arg2ErrInfo {
+                        range: [f32; 2],
+                    }
 
-                        Phase | Modulus => assert!(
-                            arg2s.iter().all(|v| (-1.0..=1.0).contains(v)),
-                            "ARG2 should be: -1 <= ARG2 <= 1"
-                        ),
+                    let err_info = match config.function {
+                        Cos | Sin if arg2s.iter().any(|v| !(0.0..=1.0).contains(v)) => {
+                            Some(Arg2ErrInfo { range: [0.0, 1.0] })
+                        }
 
-                        _ => (),
+                        Phase | Modulus if arg2s.iter().any(|v| !(-1.0..=1.0).contains(v)) => {
+                            Some(Arg2ErrInfo { range: [-1.0, 1.0] })
+                        }
+
+                        Cos | Sin | Phase | Modulus | Arctan | Cosh | Sinh | Arctanh | Ln | Sqrt => None,
+                    };
+
+                    if let Some(err) = err_info {
+                        return Err(CordicError::ArgError(ArgError {
+                            func: config.function,
+                            scale: None,
+                            arg_range: err.range,
+                            inclusive_upper_bound: true,
+                            arg_type: ArgType::Arg2,
+                        }));
                     }
                 }
+
+                Ok(())
             }
         }
     };
diff --git a/embassy-stm32/src/cordic/sealed.rs b/embassy-stm32/src/cordic/sealed.rs
index 0f00e380c..f9521ff7a 100644
--- a/embassy-stm32/src/cordic/sealed.rs
+++ b/embassy-stm32/src/cordic/sealed.rs
@@ -66,21 +66,21 @@ pub trait Instance {
     }
 
     /// Set NARGS value
-    fn set_argument_count(&self, n: Count) {
+    fn set_argument_count(&self, n: AccessCount) {
         Self::regs().csr().modify(|v| {
             v.set_nargs(match n {
-                Count::One => vals::Num::NUM1,
-                Count::Two => vals::Num::NUM2,
+                AccessCount::One => vals::Num::NUM1,
+                AccessCount::Two => vals::Num::NUM2,
             })
         })
     }
 
     /// Set NRES value
-    fn set_result_count(&self, n: Count) {
+    fn set_result_count(&self, n: AccessCount) {
         Self::regs().csr().modify(|v| {
             v.set_nres(match n {
-                Count::One => vals::Num::NUM1,
-                Count::Two => vals::Num::NUM2,
+                AccessCount::One => vals::Num::NUM1,
+                AccessCount::Two => vals::Num::NUM2,
             });
         })
     }

From c42d9f9eaae546faae46c4d1121f1fbc393c2073 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Thu, 21 Mar 2024 13:25:40 +0800
Subject: [PATCH 09/17] stm32 CORDIC: bug fix

---
 embassy-stm32/src/cordic/errors.rs | 76 +++++++++++++++++++++++-------
 embassy-stm32/src/cordic/mod.rs    | 53 ++++++++++-----------
 embassy-stm32/src/cordic/utils.rs  | 43 +++++++++--------
 3 files changed, 109 insertions(+), 63 deletions(-)

diff --git a/embassy-stm32/src/cordic/errors.rs b/embassy-stm32/src/cordic/errors.rs
index d0b2dc618..2c0aca4a2 100644
--- a/embassy-stm32/src/cordic/errors.rs
+++ b/embassy-stm32/src/cordic/errors.rs
@@ -9,6 +9,26 @@ pub enum CordicError {
     ArgError(ArgError),
     /// Output buffer length error
     OutputLengthNotEnough,
+    /// Input value is out of range for Q1.x format
+    NumberOutOfRange(NumberOutOfRange),
+}
+
+impl From<ConfigError> for CordicError {
+    fn from(value: ConfigError) -> Self {
+        Self::ConfigError(value)
+    }
+}
+
+impl From<ArgError> for CordicError {
+    fn from(value: ArgError) -> Self {
+        Self::ArgError(value)
+    }
+}
+
+impl From<NumberOutOfRange> for CordicError {
+    fn from(value: NumberOutOfRange) -> Self {
+        Self::NumberOutOfRange(value)
+    }
 }
 
 #[cfg(feature = "defmt")]
@@ -19,6 +39,7 @@ impl defmt::Format for CordicError {
         match self {
             ConfigError(e) => defmt::write!(fmt, "{}", e),
             ArgError(e) => defmt::write!(fmt, "{}", e),
+            NumberOutOfRange(e) => defmt::write!(fmt, "{}", e),
             OutputLengthNotEnough => defmt::write!(fmt, "Output buffer length is not long enough"),
         }
     }
@@ -68,28 +89,51 @@ impl defmt::Format for ArgError {
             defmt::write!(fmt, " when SCALE is {},", scale);
         }
 
-        let arg_string = match self.arg_type {
-            ArgType::Arg1 => "ARG1",
-            ArgType::Arg2 => "ARG2",
+        defmt::write!(fmt, " {} should be", self.arg_type);
+
+        if self.inclusive_upper_bound {
+            defmt::write!(
+                fmt,
+                " {} <= {} <= {}",
+                self.arg_range[0],
+                self.arg_type,
+                self.arg_range[1]
+            )
+        } else {
+            defmt::write!(
+                fmt,
+                " {} <= {} < {}",
+                self.arg_range[0],
+                self.arg_type,
+                self.arg_range[1]
+            )
         };
-
-        defmt::write!(fmt, " {} should be", arg_string);
-
-        let inclusive_string = if self.inclusive_upper_bound { "=" } else { "" };
-
-        defmt::write!(
-            fmt,
-            " {} <= {} <{} {}",
-            self.arg_range[0],
-            arg_string,
-            inclusive_string,
-            self.arg_range[1]
-        )
     }
 }
 
 #[derive(Debug)]
+#[cfg_attr(feature = "defmt", derive(defmt::Format))]
 pub(super) enum ArgType {
     Arg1,
     Arg2,
 }
+
+/// Input value is out of range for Q1.x format
+#[allow(missing_docs)]
+#[derive(Debug)]
+pub enum NumberOutOfRange {
+    BelowLowerBound,
+    AboveUpperBound,
+}
+
+#[cfg(feature = "defmt")]
+impl defmt::Format for NumberOutOfRange {
+    fn format(&self, fmt: defmt::Formatter) {
+        use NumberOutOfRange::*;
+
+        match self {
+            BelowLowerBound => defmt::write!(fmt, "input value should be equal or greater than -1"),
+            AboveUpperBound => defmt::write!(fmt, "input value should be equal or less than 1"),
+        }
+    }
+}
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index 5ac9addd8..b0db3f060 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -56,7 +56,7 @@ impl Config {
         Ok(config)
     }
 
-    fn check_scale(&self) -> Result<(), CordicError> {
+    fn check_scale(&self) -> Result<(), ConfigError> {
         use Function::*;
 
         let scale_raw = self.scale as u8;
@@ -76,10 +76,10 @@ impl Config {
         };
 
         if let Some(range) = err_range {
-            Err(CordicError::ConfigError(ConfigError {
+            Err(ConfigError {
                 func: self.function,
                 scale_range: range,
-            }))
+            })
         } else {
             Ok(())
         }
@@ -226,20 +226,20 @@ impl<'d, T: Instance> Cordic<'d, T> {
             consumed_input_len = double_input.len() + 1;
 
             // preload first value from arg1 to cordic
-            self.blocking_write_f64(arg1s[0]);
+            self.blocking_write_f64(arg1s[0])?;
 
             for (&arg1, &arg2) in double_input {
                 // Since we manually preload a value before,
                 // we will write arg2 (from the actual last pair) first, (at this moment, cordic start to calculating,)
                 // and write arg1 (from the actual next pair), then read the result, to "keep preloading"
 
-                self.blocking_write_f64(arg2);
-                self.blocking_write_f64(arg1);
+                self.blocking_write_f64(arg2)?;
+                self.blocking_write_f64(arg1)?;
                 self.blocking_read_f64_to_buf(output, &mut output_count);
             }
 
             // write last input value from arg2s, then read out the result
-            self.blocking_write_f64(arg2s[arg2s.len() - 1]);
+            self.blocking_write_f64(arg2s[arg2s.len() - 1])?;
             self.blocking_read_f64_to_buf(output, &mut output_count);
         }
 
@@ -253,12 +253,12 @@ impl<'d, T: Instance> Cordic<'d, T> {
             self.peri.set_argument_count(AccessCount::One);
 
             // "preload" value to cordic (at this moment, cordic start to calculating)
-            self.blocking_write_f64(input_left[0]);
+            self.blocking_write_f64(input_left[0])?;
 
             for &arg in input_left.iter().skip(1) {
                 // this line write arg for next round caculation to cordic,
                 // and read result from last round
-                self.blocking_write_f64(arg);
+                self.blocking_write_f64(arg)?;
                 self.blocking_read_f64_to_buf(output, &mut output_count);
             }
 
@@ -281,8 +281,9 @@ impl<'d, T: Instance> Cordic<'d, T> {
         }
     }
 
-    fn blocking_write_f64(&mut self, arg: f64) {
-        self.peri.write_argument(utils::f64_to_q1_31(arg));
+    fn blocking_write_f64(&mut self, arg: f64) -> Result<(), NumberOutOfRange> {
+        self.peri.write_argument(utils::f64_to_q1_31(arg)?);
+        Ok(())
     }
 
     /// Run a async CORDIC calculation in q.1.31 format
@@ -339,7 +340,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
             for (&arg1, &arg2) in double_input {
                 for &arg in [arg1, arg2].iter() {
-                    input_buf[input_buf_len] = utils::f64_to_q1_31(arg);
+                    input_buf[input_buf_len] = utils::f64_to_q1_31(arg)?;
                     input_buf_len += 1;
                 }
 
@@ -383,7 +384,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             self.peri.set_argument_count(AccessCount::One);
 
             for &arg in input_remain {
-                input_buf[input_buf_len] = utils::f64_to_q1_31(arg);
+                input_buf[input_buf_len] = utils::f64_to_q1_31(arg)?;
                 input_buf_len += 1;
 
                 if input_buf_len == INPUT_BUF_MAX_LEN {
@@ -509,10 +510,10 @@ impl<'d, T: Instance> Cordic<'d, T> {
         let (&arg1, &arg2) = args.next().unwrap();
 
         // preloading 1 pair of arguments
-        self.blocking_write_f32(arg1, arg2);
+        self.blocking_write_f32(arg1, arg2)?;
 
         for (&arg1, &arg2) in args {
-            self.blocking_write_f32(arg1, arg2);
+            self.blocking_write_f32(arg1, arg2)?;
             self.blocking_read_f32_to_buf(output, &mut output_count);
         }
 
@@ -522,15 +523,13 @@ impl<'d, T: Instance> Cordic<'d, T> {
         Ok(output_count)
     }
 
-    fn blocking_write_f32(&mut self, arg1: f32, arg2: f32) {
-        let reg_value: u32 = utils::f32_args_to_u32(arg1, arg2);
-        self.peri.write_argument(reg_value);
+    fn blocking_write_f32(&mut self, arg1: f32, arg2: f32) -> Result<(), NumberOutOfRange> {
+        self.peri.write_argument(utils::f32_args_to_u32(arg1, arg2)?);
+        Ok(())
     }
 
     fn blocking_read_f32_to_buf(&mut self, result_buf: &mut [f32], result_index: &mut usize) {
-        let reg_value = self.peri.read_result();
-
-        let (res1, res2) = utils::u32_to_f32_res(reg_value);
+        let (res1, res2) = utils::u32_to_f32_res(self.peri.read_result());
 
         result_buf[*result_index] = res1;
         *result_index += 1;
@@ -597,7 +596,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         );
 
         for (&arg1, &arg2) in args {
-            input_buf[input_buf_len] = utils::f32_args_to_u32(arg1, arg2);
+            input_buf[input_buf_len] = utils::f32_args_to_u32(arg1, arg2)?;
             input_buf_len += 1;
 
             if input_buf_len == INPUT_BUF_MAX_LEN {
@@ -655,7 +654,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
 macro_rules! check_input_value {
     ($func_name:ident, $float_type:ty) => {
         impl<'d, T: Instance> Cordic<'d, T> {
-            fn $func_name(&self, arg1s: &[$float_type], arg2s: Option<&[$float_type]>) -> Result<(), CordicError> {
+            fn $func_name(&self, arg1s: &[$float_type], arg2s: Option<&[$float_type]>) -> Result<(), ArgError> {
                 let config = &self.config;
 
                 use Function::*;
@@ -741,13 +740,13 @@ macro_rules! check_input_value {
                 };
 
                 if let Some(err) = err_info {
-                    return Err(CordicError::ArgError(ArgError {
+                    return Err(ArgError {
                         func: config.function,
                         scale: err.scale,
                         arg_range: err.range,
                         inclusive_upper_bound: err.inclusive_upper_bound,
                         arg_type: ArgType::Arg1,
-                    }));
+                    });
                 }
 
                 // check ARG2 value
@@ -769,13 +768,13 @@ macro_rules! check_input_value {
                     };
 
                     if let Some(err) = err_info {
-                        return Err(CordicError::ArgError(ArgError {
+                        return Err(ArgError {
                             func: config.function,
                             scale: None,
                             arg_range: err.range,
                             inclusive_upper_bound: true,
                             arg_type: ArgType::Arg2,
-                        }));
+                        });
                     }
                 }
 
diff --git a/embassy-stm32/src/cordic/utils.rs b/embassy-stm32/src/cordic/utils.rs
index 79bef6b97..3c3ed224f 100644
--- a/embassy-stm32/src/cordic/utils.rs
+++ b/embassy-stm32/src/cordic/utils.rs
@@ -1,39 +1,42 @@
 //! Common match utils
+use super::errors::NumberOutOfRange;
 
 macro_rules! floating_fixed_convert {
     ($f_to_q:ident, $q_to_f:ident, $unsigned_bin_typ:ty, $signed_bin_typ:ty, $float_ty:ty, $offset:literal, $min_positive:literal) => {
         /// convert float point to fixed point format
-        pub(crate) fn $f_to_q(value: $float_ty) -> $unsigned_bin_typ {
+        pub fn $f_to_q(value: $float_ty) -> Result<$unsigned_bin_typ, NumberOutOfRange> {
             const MIN_POSITIVE: $float_ty = unsafe { core::mem::transmute($min_positive) };
 
-            assert!(
-                (-1.0 as $float_ty) <= value,
-                "input value {} should be equal or greater than -1",
-                value
-            );
+            if value < -1.0 {
+                return Err(NumberOutOfRange::BelowLowerBound)
+            }
+
+            if value > 1.0 {
+                return Err(NumberOutOfRange::AboveUpperBound)
+            }
 
 
-            let value = if value == 1.0 as $float_ty{
-                // make a exception for user specifing exact 1.0 float point,
-                // convert 1.0 to max representable value of q1.x format
+            let value = if 1.0 - MIN_POSITIVE < value && value <= 1.0 {
+                // make a exception for value between (1.0^{-x} , 1.0] float point,
+                // convert it to max representable value of q1.x format
                 (1.0 as $float_ty) - MIN_POSITIVE
             } else {
-                assert!(
-                    value <= (1.0 as $float_ty) - MIN_POSITIVE,
-                    "input value {} should be equal or less than 1-2^(-{})",
-                    value, $offset
-                );
                 value
             };
 
-            (value * ((1 as $unsigned_bin_typ << $offset) as $float_ty)) as $unsigned_bin_typ
+            // It's necessary to cast the float value to signed integer, before convert it to a unsigned value.
+            // Since value from register is actually a "signed value", a "as" cast will keep original binary format but mark it as unsgined value.
+            // see https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
+            Ok((value * ((1 as $unsigned_bin_typ << $offset) as $float_ty)) as $signed_bin_typ as $unsigned_bin_typ)
         }
 
         #[inline(always)]
         /// convert fixed point to float point format
-        pub(crate) fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
-            // It's needed to convert from unsigned to signed first, for correct result.
-            -(value as $signed_bin_typ as $float_ty) / ((1 as $unsigned_bin_typ << $offset) as $float_ty)
+        pub fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
+            // It's necessary to cast the unsigned integer to signed integer, before convert it to a float value.
+            // Since value from register is actually a "signed value", a "as" cast will keep original binary format but mark it as signed value.
+            // see https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
+            (value as $signed_bin_typ as $float_ty) / ((1 as $unsigned_bin_typ << $offset) as $float_ty)
         }
     };
 }
@@ -59,8 +62,8 @@ floating_fixed_convert!(
 );
 
 #[inline(always)]
-pub(crate) fn f32_args_to_u32(arg1: f32, arg2: f32) -> u32 {
-    f32_to_q1_15(arg1) as u32 + ((f32_to_q1_15(arg2) as u32) << 16)
+pub(crate) fn f32_args_to_u32(arg1: f32, arg2: f32) -> Result<u32, NumberOutOfRange> {
+    Ok(f32_to_q1_15(arg1)? as u32 + ((f32_to_q1_15(arg2)? as u32) << 16))
 }
 
 #[inline(always)]

From 0d065ab2d658ebfad0c6e4bba562e474d6ca1012 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Thu, 21 Mar 2024 16:06:34 +0800
Subject: [PATCH 10/17] stm32 CORDIC: add HIL test

---
 embassy-stm32/src/cordic/errors.rs |   2 +
 tests/stm32/Cargo.toml             |  13 ++-
 tests/stm32/gen_test.py            |   2 +-
 tests/stm32/src/bin/cordic.rs      | 152 +++++++++++++++++++++++++++++
 4 files changed, 165 insertions(+), 4 deletions(-)
 create mode 100644 tests/stm32/src/bin/cordic.rs

diff --git a/embassy-stm32/src/cordic/errors.rs b/embassy-stm32/src/cordic/errors.rs
index 2c0aca4a2..9020d8467 100644
--- a/embassy-stm32/src/cordic/errors.rs
+++ b/embassy-stm32/src/cordic/errors.rs
@@ -46,6 +46,7 @@ impl defmt::Format for CordicError {
 }
 
 /// Error dring parsing [Cordic::Config](super::Config)
+#[allow(dead_code)]
 #[derive(Debug)]
 pub struct ConfigError {
     pub(super) func: Function,
@@ -71,6 +72,7 @@ impl defmt::Format for ConfigError {
 }
 
 /// Error on checking input arguments
+#[allow(dead_code)]
 #[derive(Debug)]
 pub struct ArgError {
     pub(super) func: Function,
diff --git a/tests/stm32/Cargo.toml b/tests/stm32/Cargo.toml
index e42470004..345c72a03 100644
--- a/tests/stm32/Cargo.toml
+++ b/tests/stm32/Cargo.toml
@@ -15,7 +15,7 @@ stm32f446re = ["embassy-stm32/stm32f446re", "chrono", "stop", "can", "not-gpdma"
 stm32f767zi = ["embassy-stm32/stm32f767zi", "chrono", "not-gpdma", "eth", "rng"]
 stm32g071rb = ["embassy-stm32/stm32g071rb", "cm0", "not-gpdma", "dac", "ucpd"]
 stm32g491re = ["embassy-stm32/stm32g491re", "chrono", "stop", "not-gpdma", "rng", "fdcan"]
-stm32h563zi = ["embassy-stm32/stm32h563zi", "chrono", "eth", "rng", "hash"]
+stm32h563zi = ["embassy-stm32/stm32h563zi", "chrono", "eth", "rng", "hash", "cordic"]
 stm32h753zi = ["embassy-stm32/stm32h753zi", "chrono", "not-gpdma", "eth", "rng", "fdcan", "hash", "cryp"]
 stm32h755zi = ["embassy-stm32/stm32h755zi-cm7", "chrono", "not-gpdma", "eth", "dac", "rng", "fdcan", "hash", "cryp"]
 stm32h7a3zi = ["embassy-stm32/stm32h7a3zi", "not-gpdma", "rng", "fdcan"]
@@ -25,8 +25,8 @@ stm32l496zg = ["embassy-stm32/stm32l496zg", "not-gpdma", "rng"]
 stm32l4a6zg = ["embassy-stm32/stm32l4a6zg", "chrono", "not-gpdma", "rng", "hash"]
 stm32l4r5zi = ["embassy-stm32/stm32l4r5zi", "chrono", "not-gpdma", "rng"]
 stm32l552ze = ["embassy-stm32/stm32l552ze", "not-gpdma", "rng", "hash"]
-stm32u585ai = ["embassy-stm32/stm32u585ai", "chrono", "rng", "hash"]
-stm32u5a5zj = ["embassy-stm32/stm32u5a5zj", "chrono", "rng", "hash"]
+stm32u585ai = ["embassy-stm32/stm32u585ai", "chrono", "rng", "hash", "cordic"]
+stm32u5a5zj = ["embassy-stm32/stm32u5a5zj", "chrono", "rng", "hash", "cordic"]
 stm32wb55rg = ["embassy-stm32/stm32wb55rg", "chrono", "not-gpdma", "ble", "mac" , "rng"]
 stm32wba52cg = ["embassy-stm32/stm32wba52cg", "chrono", "rng", "hash"]
 stm32wl55jc = ["embassy-stm32/stm32wl55jc-cm4", "not-gpdma", "rng", "chrono"]
@@ -48,6 +48,7 @@ embassy-stm32-wpan = []
 not-gpdma = []
 dac = []
 ucpd = []
+cordic = ["dep:num-traits"]
 
 cm0 = ["portable-atomic/unsafe-assume-single-core"]
 
@@ -83,6 +84,7 @@ chrono = { version = "^0.4", default-features = false, optional = true}
 sha2 = { version = "0.10.8", default-features = false }
 hmac = "0.12.1"
 aes-gcm = {version = "0.10.3", default-features = false, features = ["aes", "heapless"] }
+num-traits = {version="0.2", default-features = false,features = ["libm"], optional = true}
 
 # BEGIN TESTS
 # Generated by gen_test.py. DO NOT EDIT.
@@ -91,6 +93,11 @@ name = "can"
 path = "src/bin/can.rs"
 required-features = [ "can",]
 
+[[bin]]
+name = "cordic"
+path = "src/bin/cordic.rs"
+required-features = [ "rng", "cordic",]
+
 [[bin]]
 name = "cryp"
 path = "src/bin/cryp.rs"
diff --git a/tests/stm32/gen_test.py b/tests/stm32/gen_test.py
index 8ff156c0e..daf714376 100644
--- a/tests/stm32/gen_test.py
+++ b/tests/stm32/gen_test.py
@@ -14,7 +14,7 @@ for f in sorted(glob('./src/bin/*.rs')):
     with open(f, 'r') as f:
         for line in f:
             if line.startswith('// required-features:'):
-                features = line.split(':', 2)[1].strip().split(',')
+                features = [feature.strip() for feature in line.split(':', 2)[1].strip().split(',')]
 
     tests[name] = features
 
diff --git a/tests/stm32/src/bin/cordic.rs b/tests/stm32/src/bin/cordic.rs
new file mode 100644
index 000000000..b580cc79b
--- /dev/null
+++ b/tests/stm32/src/bin/cordic.rs
@@ -0,0 +1,152 @@
+// required-features: rng, cordic
+
+// Test Cordic driver, with Q1.31 format, Sin function, at 24 iterations (aka PRECISION = 6), using DMA transfer
+
+// Only test on STM32H563ZI, STM32U585AI and STM32U5a5JI.
+// STM32G491RE is not tested, since it memory.x has less memory size than it actually has,
+// and the test seems use much memory than memory.x suggest.
+// see https://github.com/embassy-rs/stm32-data/issues/301#issuecomment-1925412561
+
+#![no_std]
+#![no_main]
+
+use defmt::*;
+use embassy_executor::Spawner;
+use embassy_stm32::{bind_interrupts, cordic, peripherals, rng};
+use num_traits::Float;
+use {defmt_rtt as _, panic_probe as _};
+
+bind_interrupts!(struct Irqs {
+   RNG => rng::InterruptHandler<peripherals::RNG>;
+});
+
+/* input value control, can be changed */
+
+const ARG1_LENGTH: usize = 9;
+const ARG2_LENGTH: usize = 4; // this might not be the exact length of ARG2, since ARG2 need to be inside [0, 1]
+
+const INPUT_Q1_31_LENGHT: usize = ARG1_LENGTH + ARG2_LENGTH;
+const INPUT_U8_LENGTH: usize = 4 * INPUT_Q1_31_LENGHT;
+
+#[embassy_executor::main]
+async fn main(_spawner: Spawner) {
+    let dp = embassy_stm32::init(Default::default());
+
+    //
+    // use RNG generate random Q1.31 value
+    //
+    // we don't generate floating-point value, since not all binary value are valid floating-point value,
+    // and Q1.31 only accept a fixed range of value.
+
+    let mut rng = rng::Rng::new(dp.RNG, Irqs);
+
+    let mut input_buf_u8 = [0u8; INPUT_U8_LENGTH];
+    unwrap!(rng.async_fill_bytes(&mut input_buf_u8).await);
+
+    // convert every [u8; 4] to a u32, for a Q1.31 value
+    let input_q1_31 = unsafe { core::mem::transmute::<[u8; INPUT_U8_LENGTH], [u32; INPUT_Q1_31_LENGHT]>(input_buf_u8) };
+
+    let mut input_f64_buf = [0f64; INPUT_Q1_31_LENGHT];
+
+    let mut cordic_output_f64_buf = [0f64; ARG1_LENGTH * 2];
+
+    // convert Q1.31 value back to f64, for software calculation verify
+    for (val_u32, val_f64) in input_q1_31.iter().zip(input_f64_buf.iter_mut()) {
+        *val_f64 = cordic::utils::q1_31_to_f64(*val_u32);
+    }
+
+    let mut arg2_f64_buf = [0f64; ARG2_LENGTH];
+    let mut arg2_f64_len = 0;
+
+    // check if ARG2 is in range [0, 1] (limited by CORDIC peripheral with Sin mode)
+    for &arg2 in &input_f64_buf[ARG1_LENGTH..] {
+        if arg2 >= 0.0 {
+            arg2_f64_buf[arg2_f64_len] = arg2;
+            arg2_f64_len += 1;
+        }
+    }
+
+    // the actal value feed to CORDIC
+    let arg1_f64_ls = &input_f64_buf[..ARG1_LENGTH];
+    let arg2_f64_ls = &arg2_f64_buf[..arg2_f64_len];
+
+    let mut cordic = cordic::Cordic::new(
+        dp.CORDIC,
+        unwrap!(cordic::Config::new(
+            cordic::Function::Sin,
+            Default::default(),
+            Default::default(),
+            false,
+        )),
+    );
+
+    //#[cfg(feature = "stm32g491re")]
+    //let (mut write_dma, mut read_dma) = (dp.DMA1_CH4, dp.DMA1_CH5);
+
+    #[cfg(any(feature = "stm32h563zi", feature = "stm32u585ai", feature = "stm32u5a5zj"))]
+    let (mut write_dma, mut read_dma) = (dp.GPDMA1_CH4, dp.GPDMA1_CH5);
+
+    let cordic_start_point = embassy_time::Instant::now();
+
+    let cnt = unwrap!(
+        cordic
+            .async_calc_32bit(
+                &mut write_dma,
+                &mut read_dma,
+                arg1_f64_ls,
+                Some(arg2_f64_ls),
+                &mut cordic_output_f64_buf,
+            )
+            .await
+    );
+
+    let cordic_end_point = embassy_time::Instant::now();
+
+    // since we get 2 output for 1 calculation, the output length should be ARG1_LENGTH * 2
+    defmt::assert!(cnt == ARG1_LENGTH * 2);
+
+    let mut software_output_f64_buf = [0f64; ARG1_LENGTH * 2];
+
+    // for software calc, if there is no ARG2 value, insert a 1.0 as value (the reset value for ARG2 in CORDIC)
+    let arg2_f64_ls = if arg2_f64_len == 0 { &[1.0] } else { arg2_f64_ls };
+
+    let software_inputs = arg1_f64_ls
+        .iter()
+        .zip(
+            arg2_f64_ls
+                .iter()
+                .chain(core::iter::repeat(&arg2_f64_ls[arg2_f64_ls.len() - 1])),
+        )
+        .zip(software_output_f64_buf.chunks_mut(2));
+
+    let software_start_point = embassy_time::Instant::now();
+
+    for ((arg1, arg2), res) in software_inputs {
+        let (raw_res1, raw_res2) = (arg1 * core::f64::consts::PI).sin_cos();
+
+        (res[0], res[1]) = (raw_res1 * arg2, raw_res2 * arg2);
+    }
+
+    let software_end_point = embassy_time::Instant::now();
+
+    for (cordic_res, software_res) in cordic_output_f64_buf[..cnt]
+        .chunks(2)
+        .zip(software_output_f64_buf.chunks(2))
+    {
+        for (cord_res, soft_res) in cordic_res.iter().zip(software_res.iter()) {
+            defmt::assert!((cord_res - soft_res).abs() <= 2.0.powi(-19));
+        }
+    }
+
+    // This comparsion is just for fun. Since it not a equal compare:
+    // software use 64-bit floating point, but CORDIC use 32-bit fixed point.
+    trace!(
+        "calculate count: {}, Cordic time: {} us, software time: {} us",
+        ARG1_LENGTH,
+        (cordic_end_point - cordic_start_point).as_micros(),
+        (software_end_point - software_start_point).as_micros()
+    );
+
+    info!("Test OK");
+    cortex_m::asm::bkpt();
+}

From fac4f9aa2f67aa6c7f522a10a4c546434a39883e Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Fri, 22 Mar 2024 00:24:53 +0800
Subject: [PATCH 11/17] stm32 CORDIC: typo fix

---
 embassy-stm32/src/cordic/enums.rs  |  2 +-
 embassy-stm32/src/cordic/errors.rs |  2 +-
 embassy-stm32/src/cordic/mod.rs    | 18 +++++++++---------
 embassy-stm32/src/cordic/utils.rs  |  4 ++--
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/embassy-stm32/src/cordic/enums.rs b/embassy-stm32/src/cordic/enums.rs
index 4b92a6cf8..e8695fac7 100644
--- a/embassy-stm32/src/cordic/enums.rs
+++ b/embassy-stm32/src/cordic/enums.rs
@@ -25,7 +25,7 @@ pub enum Precision {
     Iters16,
     Iters20,
     #[default]
-    Iters24, // this value is recomended by Reference Manual
+    Iters24, // this value is recommended by Reference Manual
     Iters28,
     Iters32,
     Iters36,
diff --git a/embassy-stm32/src/cordic/errors.rs b/embassy-stm32/src/cordic/errors.rs
index 9020d8467..653014290 100644
--- a/embassy-stm32/src/cordic/errors.rs
+++ b/embassy-stm32/src/cordic/errors.rs
@@ -45,7 +45,7 @@ impl defmt::Format for CordicError {
     }
 }
 
-/// Error dring parsing [Cordic::Config](super::Config)
+/// Error during parsing [Cordic::Config](super::Config)
 #[allow(dead_code)]
 #[derive(Debug)]
 pub struct ConfigError {
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index b0db3f060..f12efe2eb 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -91,8 +91,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
     /// Create a Cordic driver instance
     ///
     /// Note:  
-    /// If you need a periperhal -> CORDIC -> peripehral mode,  
-    /// you may want to set Cordic into [Mode::ZeroOverhead] mode, and add extra arguemnts with [Self::extra_config]
+    /// If you need a peripheral -> CORDIC -> peripheral mode,  
+    /// you may want to set Cordic into [Mode::ZeroOverhead] mode, and add extra arguments with [Self::extra_config]
     pub fn new(peri: impl Peripheral<P = T> + 'd, config: Config) -> Self {
         T::enable_and_reset();
 
@@ -123,7 +123,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         self.peri.set_scale(self.config.scale);
 
         // we don't set NRES in here, but to make sure NRES is set each time user call "calc"-ish functions,
-        // since each "calc"-ish functions can have different ARGSIZE and RESSIZE, thus NRES should be change accrodingly.
+        // since each "calc"-ish functions can have different ARGSIZE and RESSIZE, thus NRES should be change accordingly.
     }
 
     async fn launch_a_dma_transfer(
@@ -256,7 +256,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             self.blocking_write_f64(input_left[0])?;
 
             for &arg in input_left.iter().skip(1) {
-                // this line write arg for next round caculation to cordic,
+                // this line write arg for next round calculation to cordic,
                 // and read result from last round
                 self.blocking_write_f64(arg)?;
                 self.blocking_read_f64_to_buf(output, &mut output_count);
@@ -426,8 +426,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
         write_dma: impl Peripheral<P = impl WriteDma<T>>,
         read_dma: impl Peripheral<P = impl ReadDma<T>>,
         double_input: bool,             // gether extra info to calc output_buf size
-        input_buf: &[u32],              // input_buf, its content should be extact values and length for calculation
-        output: &mut [f64],             // caller uses should this as a final output array
+        input_buf: &[u32],              // input_buf, its content should be exact length for calculation
+        output: &mut [f64],             // caller should uses this buf as a final output array
         output_start_index: &mut usize, // the index of start point of the output for this round of calculation
     ) {
         // output_buf is the place to store raw value from CORDIC (via DMA).
@@ -581,7 +581,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
         // In q1.15 mode, we always fill 1 pair of 16bit value into WDATA register.
         // If arg2s is None or empty array, we assume arg2 value always 1.0 (as reset value for ARG2).
         // If arg2s has some value, and but not as long as arg1s,
-        // we fill the reset of arg2 values with last value from arg2s (as q1.31 version does)
+        // we fill the reset of arg2 values with last value from arg2s (as CORDIC behavior on q1.31 format)
 
         let arg2_default_value = match arg2s {
             Some(arg2s) if !arg2s.is_empty() => arg2s[arg2s.len() - 1],
@@ -624,8 +624,8 @@ impl<'d, T: Instance> Cordic<'d, T> {
         &mut self,
         write_dma: impl Peripheral<P = impl WriteDma<T>>,
         read_dma: impl Peripheral<P = impl ReadDma<T>>,
-        input_buf: &[u32],  // input_buf, its content should be extact values and length for calculation
-        output: &mut [f32], // caller uses should this as a final output array
+        input_buf: &[u32],              // input_buf, its content should be exact length for calculation
+        output: &mut [f32],             // caller should uses this buf as a final output array
         output_start_index: &mut usize, // the index of start point of the output for this round of calculation
     ) {
         // output_buf is the place to store raw value from CORDIC (via DMA).
diff --git a/embassy-stm32/src/cordic/utils.rs b/embassy-stm32/src/cordic/utils.rs
index 3c3ed224f..41821d6e2 100644
--- a/embassy-stm32/src/cordic/utils.rs
+++ b/embassy-stm32/src/cordic/utils.rs
@@ -25,7 +25,7 @@ macro_rules! floating_fixed_convert {
             };
 
             // It's necessary to cast the float value to signed integer, before convert it to a unsigned value.
-            // Since value from register is actually a "signed value", a "as" cast will keep original binary format but mark it as unsgined value.
+            // Since value from register is actually a "signed value", a "as" cast will keep original binary format but mark it as a unsigned value for register writing.
             // see https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
             Ok((value * ((1 as $unsigned_bin_typ << $offset) as $float_ty)) as $signed_bin_typ as $unsigned_bin_typ)
         }
@@ -34,7 +34,7 @@ macro_rules! floating_fixed_convert {
         /// convert fixed point to float point format
         pub fn $q_to_f(value: $unsigned_bin_typ) -> $float_ty {
             // It's necessary to cast the unsigned integer to signed integer, before convert it to a float value.
-            // Since value from register is actually a "signed value", a "as" cast will keep original binary format but mark it as signed value.
+            // Since value from register is actually a "signed value", a "as" cast will keep original binary format but mark it as a signed value.
             // see https://doc.rust-lang.org/reference/expressions/operator-expr.html#numeric-cast
             (value as $signed_bin_typ as $float_ty) / ((1 as $unsigned_bin_typ << $offset) as $float_ty)
         }

From 441aa4c8cede2b63cc55b51db6eb89b1e35671f9 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Fri, 22 Mar 2024 00:39:43 +0800
Subject: [PATCH 12/17] stm32 CORDIC: make HIL run

---
 tests/stm32/src/bin/cordic.rs | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/tests/stm32/src/bin/cordic.rs b/tests/stm32/src/bin/cordic.rs
index b580cc79b..cd2e9d6f7 100644
--- a/tests/stm32/src/bin/cordic.rs
+++ b/tests/stm32/src/bin/cordic.rs
@@ -4,13 +4,15 @@
 
 // Only test on STM32H563ZI, STM32U585AI and STM32U5a5JI.
 // STM32G491RE is not tested, since it memory.x has less memory size than it actually has,
-// and the test seems use much memory than memory.x suggest.
+// and the test seems use more memory than memory.x suggest.
 // see https://github.com/embassy-rs/stm32-data/issues/301#issuecomment-1925412561
 
 #![no_std]
 #![no_main]
 
-use defmt::*;
+#[path = "../common.rs"]
+mod common;
+use common::*;
 use embassy_executor::Spawner;
 use embassy_stm32::{bind_interrupts, cordic, peripherals, rng};
 use num_traits::Float;
@@ -25,12 +27,12 @@ bind_interrupts!(struct Irqs {
 const ARG1_LENGTH: usize = 9;
 const ARG2_LENGTH: usize = 4; // this might not be the exact length of ARG2, since ARG2 need to be inside [0, 1]
 
-const INPUT_Q1_31_LENGHT: usize = ARG1_LENGTH + ARG2_LENGTH;
-const INPUT_U8_LENGTH: usize = 4 * INPUT_Q1_31_LENGHT;
+const INPUT_Q1_31_LENGTH: usize = ARG1_LENGTH + ARG2_LENGTH;
+const INPUT_U8_LENGTH: usize = 4 * INPUT_Q1_31_LENGTH;
 
 #[embassy_executor::main]
 async fn main(_spawner: Spawner) {
-    let dp = embassy_stm32::init(Default::default());
+    let dp = embassy_stm32::init(config());
 
     //
     // use RNG generate random Q1.31 value
@@ -41,12 +43,12 @@ async fn main(_spawner: Spawner) {
     let mut rng = rng::Rng::new(dp.RNG, Irqs);
 
     let mut input_buf_u8 = [0u8; INPUT_U8_LENGTH];
-    unwrap!(rng.async_fill_bytes(&mut input_buf_u8).await);
+    defmt::unwrap!(rng.async_fill_bytes(&mut input_buf_u8).await);
 
     // convert every [u8; 4] to a u32, for a Q1.31 value
-    let input_q1_31 = unsafe { core::mem::transmute::<[u8; INPUT_U8_LENGTH], [u32; INPUT_Q1_31_LENGHT]>(input_buf_u8) };
+    let input_q1_31 = unsafe { core::mem::transmute::<[u8; INPUT_U8_LENGTH], [u32; INPUT_Q1_31_LENGTH]>(input_buf_u8) };
 
-    let mut input_f64_buf = [0f64; INPUT_Q1_31_LENGHT];
+    let mut input_f64_buf = [0f64; INPUT_Q1_31_LENGTH];
 
     let mut cordic_output_f64_buf = [0f64; ARG1_LENGTH * 2];
 
@@ -66,13 +68,13 @@ async fn main(_spawner: Spawner) {
         }
     }
 
-    // the actal value feed to CORDIC
+    // the actual value feed to CORDIC
     let arg1_f64_ls = &input_f64_buf[..ARG1_LENGTH];
     let arg2_f64_ls = &arg2_f64_buf[..arg2_f64_len];
 
     let mut cordic = cordic::Cordic::new(
         dp.CORDIC,
-        unwrap!(cordic::Config::new(
+        defmt::unwrap!(cordic::Config::new(
             cordic::Function::Sin,
             Default::default(),
             Default::default(),
@@ -138,9 +140,9 @@ async fn main(_spawner: Spawner) {
         }
     }
 
-    // This comparsion is just for fun. Since it not a equal compare:
+    // This comparison is just for fun. Since it not a equal compare:
     // software use 64-bit floating point, but CORDIC use 32-bit fixed point.
-    trace!(
+    defmt::trace!(
         "calculate count: {}, Cordic time: {} us, software time: {} us",
         ARG1_LENGTH,
         (cordic_end_point - cordic_start_point).as_micros(),

From 83069e7b49bd181236e6a68005ad6119d39b39c3 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Fri, 22 Mar 2024 00:58:03 +0800
Subject: [PATCH 13/17] stm32 CORDIC: add example

---
 examples/stm32h5/src/bin/cordic.rs | 35 ++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 examples/stm32h5/src/bin/cordic.rs

diff --git a/examples/stm32h5/src/bin/cordic.rs b/examples/stm32h5/src/bin/cordic.rs
new file mode 100644
index 000000000..d49f75b8f
--- /dev/null
+++ b/examples/stm32h5/src/bin/cordic.rs
@@ -0,0 +1,35 @@
+#![no_std]
+#![no_main]
+
+use defmt::*;
+use embassy_executor::Spawner;
+use embassy_stm32::cordic;
+use {defmt_rtt as _, panic_probe as _};
+
+#[embassy_executor::main]
+async fn main(_spawner: Spawner) {
+    let mut dp = embassy_stm32::init(Default::default());
+
+    let mut cordic = cordic::Cordic::new(
+        &mut dp.CORDIC,
+        unwrap!(cordic::Config::new(
+            cordic::Function::Sin,
+            Default::default(),
+            Default::default(),
+            false,
+        )),
+    );
+
+    let mut output = [0f64; 16];
+
+    let arg1 = [1.0, 0.0, -1.0]; // for trigonometric function, the ARG1 value [-pi, pi] should be map to [-1, 1]
+    let arg2 = [0.5, 1.0];
+
+    let cnt = unwrap!(
+        cordic
+            .async_calc_32bit(&mut dp.GPDMA1_CH0, &mut dp.GPDMA1_CH1, &arg1, Some(&arg2), &mut output,)
+            .await
+    );
+
+    println!("async calc 32bit: {}", output[..cnt]);
+}

From 0abcccee966af0b12e62fc7fae8499fa03194823 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Fri, 22 Mar 2024 17:29:10 +0800
Subject: [PATCH 14/17] stm32 CORDIC: re-design API

---
 embassy-stm32/src/cordic/errors.rs |  67 +--
 embassy-stm32/src/cordic/mod.rs    | 781 +++++++++++------------------
 embassy-stm32/src/cordic/utils.rs  |  15 +-
 examples/stm32h5/src/bin/cordic.rs |  59 ++-
 tests/stm32/src/bin/cordic.rs      | 110 ++--
 5 files changed, 434 insertions(+), 598 deletions(-)

diff --git a/embassy-stm32/src/cordic/errors.rs b/embassy-stm32/src/cordic/errors.rs
index 653014290..3c70fc9e7 100644
--- a/embassy-stm32/src/cordic/errors.rs
+++ b/embassy-stm32/src/cordic/errors.rs
@@ -5,12 +5,14 @@ use super::{Function, Scale};
 pub enum CordicError {
     /// Config error
     ConfigError(ConfigError),
-    /// Argument error
-    ArgError(ArgError),
-    /// Output buffer length error
-    OutputLengthNotEnough,
+    /// Argument length is incorrect
+    ArgumentLengthIncorrect,
+    /// Result buffer length error
+    ResultLengthNotEnough,
     /// Input value is out of range for Q1.x format
     NumberOutOfRange(NumberOutOfRange),
+    /// Argument error
+    ArgError(ArgError),
 }
 
 impl From<ConfigError> for CordicError {
@@ -19,18 +21,18 @@ impl From<ConfigError> for CordicError {
     }
 }
 
-impl From<ArgError> for CordicError {
-    fn from(value: ArgError) -> Self {
-        Self::ArgError(value)
-    }
-}
-
 impl From<NumberOutOfRange> for CordicError {
     fn from(value: NumberOutOfRange) -> Self {
         Self::NumberOutOfRange(value)
     }
 }
 
+impl From<ArgError> for CordicError {
+    fn from(value: ArgError) -> Self {
+        Self::ArgError(value)
+    }
+}
+
 #[cfg(feature = "defmt")]
 impl defmt::Format for CordicError {
     fn format(&self, fmt: defmt::Formatter) {
@@ -38,9 +40,10 @@ impl defmt::Format for CordicError {
 
         match self {
             ConfigError(e) => defmt::write!(fmt, "{}", e),
-            ArgError(e) => defmt::write!(fmt, "{}", e),
+            ResultLengthNotEnough => defmt::write!(fmt, "Output buffer length is not long enough"),
+            ArgumentLengthIncorrect => defmt::write!(fmt, "Argument length incorrect"),
             NumberOutOfRange(e) => defmt::write!(fmt, "{}", e),
-            OutputLengthNotEnough => defmt::write!(fmt, "Output buffer length is not long enough"),
+            ArgError(e) => defmt::write!(fmt, "{}", e),
         }
     }
 }
@@ -71,6 +74,26 @@ impl defmt::Format for ConfigError {
     }
 }
 
+/// Input value is out of range for Q1.x format
+#[allow(missing_docs)]
+#[derive(Debug)]
+pub enum NumberOutOfRange {
+    BelowLowerBound,
+    AboveUpperBound,
+}
+
+#[cfg(feature = "defmt")]
+impl defmt::Format for NumberOutOfRange {
+    fn format(&self, fmt: defmt::Formatter) {
+        use NumberOutOfRange::*;
+
+        match self {
+            BelowLowerBound => defmt::write!(fmt, "input value should be equal or greater than -1"),
+            AboveUpperBound => defmt::write!(fmt, "input value should be equal or less than 1"),
+        }
+    }
+}
+
 /// Error on checking input arguments
 #[allow(dead_code)]
 #[derive(Debug)]
@@ -119,23 +142,3 @@ pub(super) enum ArgType {
     Arg1,
     Arg2,
 }
-
-/// Input value is out of range for Q1.x format
-#[allow(missing_docs)]
-#[derive(Debug)]
-pub enum NumberOutOfRange {
-    BelowLowerBound,
-    AboveUpperBound,
-}
-
-#[cfg(feature = "defmt")]
-impl defmt::Format for NumberOutOfRange {
-    fn format(&self, fmt: defmt::Formatter) {
-        use NumberOutOfRange::*;
-
-        match self {
-            BelowLowerBound => defmt::write!(fmt, "input value should be equal or greater than -1"),
-            AboveUpperBound => defmt::write!(fmt, "input value should be equal or less than 1"),
-        }
-    }
-}
diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index f12efe2eb..2479e1b27 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -21,8 +21,6 @@ pub mod low_level {
     pub use super::sealed::*;
 }
 
-const INPUT_BUF_MAX_LEN: usize = 16;
-
 /// CORDIC driver
 pub struct Cordic<'d, T: Instance> {
     peri: PeripheralRef<'d, T>,
@@ -38,17 +36,15 @@ pub struct Config {
     function: Function,
     precision: Precision,
     scale: Scale,
-    res1_only: bool,
 }
 
 impl Config {
     /// Create a config for Cordic driver
-    pub fn new(function: Function, precision: Precision, scale: Scale, res1_only: bool) -> Result<Self, CordicError> {
+    pub fn new(function: Function, precision: Precision, scale: Scale) -> Result<Self, CordicError> {
         let config = Self {
             function,
             precision,
             scale,
-            res1_only,
         };
 
         config.check_scale()?;
@@ -117,7 +113,32 @@ impl<'d, T: Instance> Cordic<'d, T> {
         self.peri.set_data_width(arg_width, res_width);
     }
 
-    fn reconfigure(&mut self) {
+    fn clean_rrdy_flag(&mut self) {
+        while self.peri.ready_to_read() {
+            self.peri.read_result();
+        }
+    }
+
+    /// Disable IRQ and DMA, clean RRDY, and set ARG2 to +1 (0x7FFFFFFF)
+    pub fn reconfigure(&mut self) {
+        // reset ARG2 to +1
+        {
+            self.peri.disable_irq();
+            self.peri.disable_read_dma();
+            self.peri.disable_write_dma();
+            self.clean_rrdy_flag();
+
+            self.peri.set_func(Function::Cos);
+            self.peri.set_precision(Precision::Iters4);
+            self.peri.set_scale(Scale::Arg1Res1);
+            self.peri.set_argument_count(AccessCount::Two);
+            self.peri.set_data_width(Width::Bits32, Width::Bits32);
+            self.peri.write_argument(0x0u32);
+            self.peri.write_argument(0x7FFFFFFFu32);
+
+            self.clean_rrdy_flag();
+        }
+
         self.peri.set_func(self.config.function);
         self.peri.set_precision(self.config.precision);
         self.peri.set_scale(self.config.scale);
@@ -125,16 +146,154 @@ impl<'d, T: Instance> Cordic<'d, T> {
         // we don't set NRES in here, but to make sure NRES is set each time user call "calc"-ish functions,
         // since each "calc"-ish functions can have different ARGSIZE and RESSIZE, thus NRES should be change accordingly.
     }
+}
 
-    async fn launch_a_dma_transfer(
+impl<'d, T: Instance> Drop for Cordic<'d, T> {
+    fn drop(&mut self) {
+        T::disable();
+    }
+}
+
+// q1.31 related
+impl<'d, T: Instance> Cordic<'d, T> {
+    /// Run a blocking CORDIC calculation in q1.31 format  
+    ///
+    /// Notice:  
+    /// If you set `arg1_only` to `true`, make sure ARG2 has already been set to the desired value.  
+    /// This function does not set ARG2 to +1 before or after each round of calculation.  
+    /// To make sure ARG2 is set to +1, call [.reconfigure()](Self::reconfigure) first.
+    pub fn blocking_calc_32bit(
+        &mut self,
+        arg: &[u32],
+        res: &mut [u32],
+        arg1_only: bool,
+        res1_only: bool,
+    ) -> Result<usize, CordicError> {
+        if arg.is_empty() {
+            return Ok(0);
+        }
+
+        let res_cnt = Self::check_arg_res_length_32bit(arg.len(), res.len(), arg1_only, res1_only)?;
+
+        self.peri
+            .set_argument_count(if arg1_only { AccessCount::One } else { AccessCount::Two });
+
+        self.peri
+            .set_result_count(if res1_only { AccessCount::One } else { AccessCount::Two });
+
+        self.peri.set_data_width(Width::Bits32, Width::Bits32);
+
+        let mut cnt = 0;
+
+        match arg1_only {
+            true => {
+                // To use the CORDIC preload feature, the first value is special:
+                // it is written to the CORDIC WDATA register outside of the loop.
+                let first_value = arg[0];
+
+                // preload 1st value to CORDIC, to start the CORDIC calc
+                self.peri.write_argument(first_value);
+
+                for &arg1 in &arg[1..] {
+                    // preload arg1 (for next calc)
+                    self.peri.write_argument(arg1);
+
+                    // then read current result out
+                    res[cnt] = self.peri.read_result();
+                    cnt += 1;
+                    if !res1_only {
+                        res[cnt] = self.peri.read_result();
+                        cnt += 1;
+                    }
+                }
+
+                // read the last result
+                res[cnt] = self.peri.read_result();
+                cnt += 1;
+                if !res1_only {
+                    res[cnt] = self.peri.read_result();
+                    // cnt += 1;
+                }
+            }
+            false => {
+                // To use the CORDIC preload feature, the first and last values are special:
+                // they are written to the CORDIC WDATA register outside of the loop.
+                let first_value = arg[0];
+                let last_value = arg[arg.len() - 1];
+
+                let paired_args = &arg[1..arg.len() - 1];
+
+                // preload 1st value to CORDIC
+                self.peri.write_argument(first_value);
+
+                for args in paired_args.chunks(2) {
+                    let arg2 = args[0];
+                    let arg1 = args[1];
+
+                    // load arg2 (for current calc) first, to start the CORDIC calc
+                    self.peri.write_argument(arg2);
+
+                    // preload arg1 (for next calc)
+                    self.peri.write_argument(arg1);
+
+                    // then read current result out
+                    res[cnt] = self.peri.read_result();
+                    cnt += 1;
+                    if !res1_only {
+                        res[cnt] = self.peri.read_result();
+                        cnt += 1;
+                    }
+                }
+
+                // load last value to CORDIC, and finish the calculation
+                self.peri.write_argument(last_value);
+                res[cnt] = self.peri.read_result();
+                cnt += 1;
+                if !res1_only {
+                    res[cnt] = self.peri.read_result();
+                    // cnt += 1;
+                }
+            }
+        }
+
+        // at this point cnt should be equal to res_cnt
+
+        Ok(res_cnt)
+    }
+
+    /// Run an async CORDIC calculation in q1.31 format
+    ///
+    /// Notice:  
+    /// If you set `arg1_only` to `true`, make sure ARG2 has already been set to the desired value.  
+    /// This function does not set ARG2 to +1 before or after each round of calculation.  
+    /// To make sure ARG2 is set to +1, call [.reconfigure()](Self::reconfigure) first.
+    pub async fn async_calc_32bit(
         &mut self,
         write_dma: impl Peripheral<P = impl WriteDma<T>>,
         read_dma: impl Peripheral<P = impl ReadDma<T>>,
-        input: &[u32],
-        output: &mut [u32],
-    ) {
+        arg: &[u32],
+        res: &mut [u32],
+        arg1_only: bool,
+        res1_only: bool,
+    ) -> Result<usize, CordicError> {
+        if arg.is_empty() {
+            return Ok(0);
+        }
+
+        let res_cnt = Self::check_arg_res_length_32bit(arg.len(), res.len(), arg1_only, res1_only)?;
+
+        let active_res_buf = &mut res[..res_cnt];
+
         into_ref!(write_dma, read_dma);
 
+        self.peri
+            .set_argument_count(if arg1_only { AccessCount::One } else { AccessCount::Two });
+
+        self.peri
+            .set_result_count(if res1_only { AccessCount::One } else { AccessCount::Two });
+
+        self.peri.set_data_width(Width::Bits32, Width::Bits32);
+
         let write_req = write_dma.request();
         let read_req = read_dma.request();
 
@@ -150,7 +309,7 @@ impl<'d, T: Instance> Cordic<'d, T> {
             let write_transfer = dma::Transfer::new_write(
                 &mut write_dma,
                 write_req,
-                input,
+                arg,
                 T::regs().wdata().as_ptr() as *mut _,
                 Default::default(),
             );
@@ -159,328 +318,60 @@ impl<'d, T: Instance> Cordic<'d, T> {
                 &mut read_dma,
                 read_req,
                 T::regs().rdata().as_ptr() as *mut _,
-                output,
+                active_res_buf,
                 Default::default(),
             );
 
             embassy_futures::join::join(write_transfer, read_transfer).await;
         }
-    }
-}
 
-impl<'d, T: Instance> Drop for Cordic<'d, T> {
-    fn drop(&mut self) {
-        T::disable();
+        Ok(res_cnt)
     }
-}
 
-// q1.31 related
-impl<'d, T: Instance> Cordic<'d, T> {
-    /// Run a blocking CORDIC calculation in q1.31 format
-    pub fn blocking_calc_32bit(
-        &mut self,
-        arg1s: &[f64],
-        arg2s: Option<&[f64]>,
-        output: &mut [f64],
+    fn check_arg_res_length_32bit(
+        arg_len: usize,
+        res_len: usize,
+        arg1_only: bool,
+        res1_only: bool,
     ) -> Result<usize, CordicError> {
-        if arg1s.is_empty() {
-            return Ok(0);
+        if !arg1_only && arg_len % 2 != 0 {
+            return Err(CordicError::ArgumentLengthIncorrect);
         }
 
-        let output_length_enough = match self.config.res1_only {
-            true => output.len() >= arg1s.len(),
-            false => output.len() >= 2 * arg1s.len(),
-        };
+        let mut minimal_res_length = arg_len;
 
-        if !output_length_enough {
-            return Err(CordicError::OutputLengthNotEnough);
+        if !res1_only {
+            minimal_res_length *= 2;
         }
 
-        self.check_input_f64(arg1s, arg2s)?;
-
-        self.peri.set_result_count(if self.config.res1_only {
-            AccessCount::One
-        } else {
-            AccessCount::Two
-        });
-
-        self.peri.set_data_width(Width::Bits32, Width::Bits32);
-
-        let mut output_count = 0;
-
-        let mut consumed_input_len = 0;
-
-        //
-        // handle 2 input args calculation
-        //
-
-        if arg2s.is_some() && !arg2s.unwrap().is_empty() {
-            let arg2s = arg2s.unwrap();
-
-            self.peri.set_argument_count(AccessCount::Two);
-
-            // Skip 1st value from arg1s, this value will be manually "preload" to cordic, to make use of cordic preload function.
-            // And we preserve last value from arg2s, since it need to manually write to cordic, and read the result out.
-            let double_input = arg1s.iter().skip(1).zip(&arg2s[..arg2s.len() - 1]);
-            // Since we preload 1st value from arg1s, the consumed input length is double_input length + 1.
-            consumed_input_len = double_input.len() + 1;
-
-            // preload first value from arg1 to cordic
-            self.blocking_write_f64(arg1s[0])?;
-
-            for (&arg1, &arg2) in double_input {
-                // Since we manually preload a value before,
-                // we will write arg2 (from the actual last pair) first, (at this moment, cordic start to calculating,)
-                // and write arg1 (from the actual next pair), then read the result, to "keep preloading"
-
-                self.blocking_write_f64(arg2)?;
-                self.blocking_write_f64(arg1)?;
-                self.blocking_read_f64_to_buf(output, &mut output_count);
-            }
-
-            // write last input value from arg2s, then read out the result
-            self.blocking_write_f64(arg2s[arg2s.len() - 1])?;
-            self.blocking_read_f64_to_buf(output, &mut output_count);
+        if !arg1_only {
+            minimal_res_length /= 2
         }
 
-        //
-        // handle 1 input arg calculation
-        //
-
-        let input_left = &arg1s[consumed_input_len..];
-
-        if !input_left.is_empty() {
-            self.peri.set_argument_count(AccessCount::One);
-
-            // "preload" value to cordic (at this moment, cordic start to calculating)
-            self.blocking_write_f64(input_left[0])?;
-
-            for &arg in input_left.iter().skip(1) {
-                // this line write arg for next round calculation to cordic,
-                // and read result from last round
-                self.blocking_write_f64(arg)?;
-                self.blocking_read_f64_to_buf(output, &mut output_count);
-            }
-
-            // read the last output
-            self.blocking_read_f64_to_buf(output, &mut output_count);
+        if minimal_res_length > res_len {
+            return Err(CordicError::ResultLengthNotEnough);
         }
 
-        Ok(output_count)
-    }
-
-    fn blocking_read_f64_to_buf(&mut self, result_buf: &mut [f64], result_index: &mut usize) {
-        result_buf[*result_index] = utils::q1_31_to_f64(self.peri.read_result());
-        *result_index += 1;
-
-        // We don't care about whether the function return 1 or 2 results,
-        // the only thing matter is whether user want 1 or 2 results.
-        if !self.config.res1_only {
-            result_buf[*result_index] = utils::q1_31_to_f64(self.peri.read_result());
-            *result_index += 1;
-        }
-    }
-
-    fn blocking_write_f64(&mut self, arg: f64) -> Result<(), NumberOutOfRange> {
-        self.peri.write_argument(utils::f64_to_q1_31(arg)?);
-        Ok(())
-    }
-
-    /// Run a async CORDIC calculation in q.1.31 format
-    pub async fn async_calc_32bit(
-        &mut self,
-        write_dma: impl Peripheral<P = impl WriteDma<T>>,
-        read_dma: impl Peripheral<P = impl ReadDma<T>>,
-        arg1s: &[f64],
-        arg2s: Option<&[f64]>,
-        output: &mut [f64],
-    ) -> Result<usize, CordicError> {
-        if arg1s.is_empty() {
-            return Ok(0);
-        }
-
-        let output_length_enough = match self.config.res1_only {
-            true => output.len() >= arg1s.len(),
-            false => output.len() >= 2 * arg1s.len(),
-        };
-
-        if !output_length_enough {
-            return Err(CordicError::OutputLengthNotEnough);
-        }
-
-        self.check_input_f64(arg1s, arg2s)?;
-
-        into_ref!(write_dma, read_dma);
-
-        self.peri.set_result_count(if self.config.res1_only {
-            AccessCount::One
-        } else {
-            AccessCount::Two
-        });
-
-        self.peri.set_data_width(Width::Bits32, Width::Bits32);
-
-        let mut output_count = 0;
-        let mut consumed_input_len = 0;
-        let mut input_buf = [0u32; INPUT_BUF_MAX_LEN];
-        let mut input_buf_len = 0;
-
-        //
-        // handle 2 input args calculation
-        //
-
-        if !arg2s.unwrap_or_default().is_empty() {
-            let arg2s = arg2s.unwrap();
-
-            self.peri.set_argument_count(AccessCount::Two);
-
-            let double_input = arg1s.iter().zip(arg2s);
-
-            consumed_input_len = double_input.len();
-
-            for (&arg1, &arg2) in double_input {
-                for &arg in [arg1, arg2].iter() {
-                    input_buf[input_buf_len] = utils::f64_to_q1_31(arg)?;
-                    input_buf_len += 1;
-                }
-
-                if input_buf_len == INPUT_BUF_MAX_LEN {
-                    self.inner_dma_calc_32bit(
-                        &mut write_dma,
-                        &mut read_dma,
-                        true,
-                        &input_buf[..input_buf_len],
-                        output,
-                        &mut output_count,
-                    )
-                    .await;
-
-                    input_buf_len = 0;
-                }
-            }
-
-            if input_buf_len > 0 {
-                self.inner_dma_calc_32bit(
-                    &mut write_dma,
-                    &mut read_dma,
-                    true,
-                    &input_buf[..input_buf_len],
-                    output,
-                    &mut output_count,
-                )
-                .await;
-
-                input_buf_len = 0;
-            }
-        }
-
-        //
-        // handle 1 input arg calculation
-        //
-
-        if arg1s.len() > consumed_input_len {
-            let input_remain = &arg1s[consumed_input_len..];
-
-            self.peri.set_argument_count(AccessCount::One);
-
-            for &arg in input_remain {
-                input_buf[input_buf_len] = utils::f64_to_q1_31(arg)?;
-                input_buf_len += 1;
-
-                if input_buf_len == INPUT_BUF_MAX_LEN {
-                    self.inner_dma_calc_32bit(
-                        &mut write_dma,
-                        &mut read_dma,
-                        false,
-                        &input_buf[..input_buf_len],
-                        output,
-                        &mut output_count,
-                    )
-                    .await;
-
-                    input_buf_len = 0;
-                }
-            }
-
-            if input_buf_len > 0 {
-                self.inner_dma_calc_32bit(
-                    &mut write_dma,
-                    &mut read_dma,
-                    false,
-                    &input_buf[..input_buf_len],
-                    output,
-                    &mut output_count,
-                )
-                .await;
-
-                // input_buf_len = 0;
-            }
-        }
-
-        Ok(output_count)
-    }
-
-    // this function is highly coupled with async_calc_32bit, and is not intended to use in other place
-    async fn inner_dma_calc_32bit(
-        &mut self,
-        write_dma: impl Peripheral<P = impl WriteDma<T>>,
-        read_dma: impl Peripheral<P = impl ReadDma<T>>,
-        double_input: bool,             // gether extra info to calc output_buf size
-        input_buf: &[u32],              // input_buf, its content should be exact length for calculation
-        output: &mut [f64],             // caller should uses this buf as a final output array
-        output_start_index: &mut usize, // the index of start point of the output for this round of calculation
-    ) {
-        // output_buf is the place to store raw value from CORDIC (via DMA).
-        // For buf size, we assume in this round of calculation:
-        // all input is 1 arg, and all calculation need 2 output,
-        // thus output_buf will always be long enough.
-        let mut output_buf = [0u32; INPUT_BUF_MAX_LEN * 2];
-
-        let mut output_buf_size = input_buf.len();
-        if !self.config.res1_only {
-            // if we need 2 result for 1 input, then output_buf length should be 2x long.
-            output_buf_size *= 2;
-        };
-        if double_input {
-            // if input itself is 2 args for 1 calculation, then output_buf length should be /2.
-            output_buf_size /= 2;
-        }
-
-        let active_output_buf = &mut output_buf[..output_buf_size];
-
-        self.launch_a_dma_transfer(write_dma, read_dma, input_buf, active_output_buf)
-            .await;
-
-        for &mut output_u32 in active_output_buf {
-            output[*output_start_index] = utils::q1_31_to_f64(output_u32);
-            *output_start_index += 1;
-        }
+        Ok(minimal_res_length)
     }
 }
 
 // q1.15 related
 impl<'d, T: Instance> Cordic<'d, T> {
-    /// Run a blocking CORDIC calculation in q1.15 format
-    pub fn blocking_calc_16bit(
-        &mut self,
-        arg1s: &[f32],
-        arg2s: Option<&[f32]>,
-        output: &mut [f32],
-    ) -> Result<usize, CordicError> {
-        if arg1s.is_empty() {
+    /// Run a blocking CORDIC calculation in q1.15 format  
+    ///
+    /// Notice:  
+    /// The caller is responsible for merging two u16 arguments into one u32 word, and/or splitting one u32 word into two u16 results.
+    pub fn blocking_calc_16bit(&mut self, arg: &[u32], res: &mut [u32]) -> Result<usize, CordicError> {
+        if arg.is_empty() {
             return Ok(0);
         }
 
-        let output_length_enough = match self.config.res1_only {
-            true => output.len() >= arg1s.len(),
-            false => output.len() >= 2 * arg1s.len(),
-        };
-
-        if !output_length_enough {
-            return Err(CordicError::OutputLengthNotEnough);
+        if arg.len() > res.len() {
+            return Err(CordicError::ResultLengthNotEnough);
         }
 
-        self.check_input_f32(arg1s, arg2s)?;
+        let res_cnt = arg.len();
 
         // In q1.15 mode, 1 write/read to access 2 arguments/results
         self.peri.set_argument_count(AccessCount::One);
@@ -488,83 +379,53 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         self.peri.set_data_width(Width::Bits16, Width::Bits16);
 
-        let mut output_count = 0;
+        // To use the CORDIC preload feature, the first value is special:
+        // it is written to the CORDIC WDATA register outside of the loop.
+        let first_value = arg[0];
 
-        // In q1.15 mode, we always fill 1 pair of 16bit value into WDATA register.
-        // If arg2s is None or empty array, we assume arg2 value always 1.0 (as reset value for ARG2).
-        // If arg2s has some value, and but not as long as arg1s,
-        // we fill the reset of arg2 values with last value from arg2s (as q1.31 version does)
+        // preload 1st value to CORDIC, to start the CORDIC calc
+        self.peri.write_argument(first_value);
 
-        let arg2_default_value = match arg2s {
-            Some(arg2s) if !arg2s.is_empty() => arg2s[arg2s.len() - 1],
-            _ => 1.0,
-        };
+        let mut cnt = 0;
 
-        let mut args = arg1s.iter().zip(
-            arg2s
-                .unwrap_or(&[])
-                .iter()
-                .chain(core::iter::repeat(&arg2_default_value)),
-        );
+        for &arg_val in &arg[1..] {
+            // preload arg_val (for next calc)
+            self.peri.write_argument(arg_val);
 
-        let (&arg1, &arg2) = args.next().unwrap();
-
-        // preloading 1 pair of arguments
-        self.blocking_write_f32(arg1, arg2)?;
-
-        for (&arg1, &arg2) in args {
-            self.blocking_write_f32(arg1, arg2)?;
-            self.blocking_read_f32_to_buf(output, &mut output_count);
+            // then read current result out
+            res[cnt] = self.peri.read_result();
+            cnt += 1;
         }
 
-        // read last pair of value from cordic
-        self.blocking_read_f32_to_buf(output, &mut output_count);
+        // read last result out
+        res[cnt] = self.peri.read_result();
+        // cnt += 1;
 
-        Ok(output_count)
+        Ok(res_cnt)
     }
 
-    fn blocking_write_f32(&mut self, arg1: f32, arg2: f32) -> Result<(), NumberOutOfRange> {
-        self.peri.write_argument(utils::f32_args_to_u32(arg1, arg2)?);
-        Ok(())
-    }
-
-    fn blocking_read_f32_to_buf(&mut self, result_buf: &mut [f32], result_index: &mut usize) {
-        let (res1, res2) = utils::u32_to_f32_res(self.peri.read_result());
-
-        result_buf[*result_index] = res1;
-        *result_index += 1;
-
-        // We don't care about whether the function return 1 or 2 results,
-        // the only thing matter is whether user want 1 or 2 results.
-        if !self.config.res1_only {
-            result_buf[*result_index] = res2;
-            *result_index += 1;
-        }
-    }
-
-    /// Run a async CORDIC calculation in q1.15 format
+    /// Run an async CORDIC calculation in q1.15 format  
+    ///
+    /// Notice:  
+    /// The caller is responsible for merging two u16 arguments into one u32 word, and/or splitting one u32 word into two u16 results.
     pub async fn async_calc_16bit(
         &mut self,
         write_dma: impl Peripheral<P = impl WriteDma<T>>,
         read_dma: impl Peripheral<P = impl ReadDma<T>>,
-        arg1s: &[f32],
-        arg2s: Option<&[f32]>,
-        output: &mut [f32],
+        arg: &[u32],
+        res: &mut [u32],
     ) -> Result<usize, CordicError> {
-        if arg1s.is_empty() {
+        if arg.is_empty() {
             return Ok(0);
         }
 
-        let output_length_enough = match self.config.res1_only {
-            true => output.len() >= arg1s.len(),
-            false => output.len() >= 2 * arg1s.len(),
-        };
-
-        if !output_length_enough {
-            return Err(CordicError::OutputLengthNotEnough);
+        if arg.len() > res.len() {
+            return Err(CordicError::ResultLengthNotEnough);
         }
 
-        self.check_input_f32(arg1s, arg2s)?;
+        let res_cnt = arg.len();
+
+        let active_res_buf = &mut res[..res_cnt];
 
         into_ref!(write_dma, read_dma);
 
@@ -574,142 +435,96 @@ impl<'d, T: Instance> Cordic<'d, T> {
 
         self.peri.set_data_width(Width::Bits16, Width::Bits16);
 
-        let mut output_count = 0;
-        let mut input_buf = [0u32; INPUT_BUF_MAX_LEN];
-        let mut input_buf_len = 0;
+        let write_req = write_dma.request();
+        let read_req = read_dma.request();
 
-        // In q1.15 mode, we always fill 1 pair of 16bit value into WDATA register.
-        // If arg2s is None or empty array, we assume arg2 value always 1.0 (as reset value for ARG2).
-        // If arg2s has some value, and but not as long as arg1s,
-        // we fill the reset of arg2 values with last value from arg2s (as CORDIC behavior on q1.31 format)
+        self.peri.enable_write_dma();
+        self.peri.enable_read_dma();
 
-        let arg2_default_value = match arg2s {
-            Some(arg2s) if !arg2s.is_empty() => arg2s[arg2s.len() - 1],
-            _ => 1.0,
-        };
+        let _on_drop = OnDrop::new(|| {
+            self.peri.disable_write_dma();
+            self.peri.disable_read_dma();
+        });
 
-        let args = arg1s.iter().zip(
-            arg2s
-                .unwrap_or(&[])
-                .iter()
-                .chain(core::iter::repeat(&arg2_default_value)),
-        );
-
-        for (&arg1, &arg2) in args {
-            input_buf[input_buf_len] = utils::f32_args_to_u32(arg1, arg2)?;
-            input_buf_len += 1;
-
-            if input_buf_len == INPUT_BUF_MAX_LEN {
-                self.inner_dma_calc_16bit(&mut write_dma, &mut read_dma, &input_buf, output, &mut output_count)
-                    .await;
-            }
-        }
-
-        if input_buf_len > 0 {
-            self.inner_dma_calc_16bit(
+        unsafe {
+            let write_transfer = dma::Transfer::new_write(
                 &mut write_dma,
+                write_req,
+                arg,
+                T::regs().wdata().as_ptr() as *mut _,
+                Default::default(),
+            );
+
+            let read_transfer = dma::Transfer::new_read(
                 &mut read_dma,
-                &input_buf[..input_buf_len],
-                output,
-                &mut output_count,
-            )
-            .await;
+                read_req,
+                T::regs().rdata().as_ptr() as *mut _,
+                active_res_buf,
+                Default::default(),
+            );
+
+            embassy_futures::join::join(write_transfer, read_transfer).await;
         }
 
-        Ok(output_count)
-    }
-
-    // this function is highly coupled with async_calc_16bit, and is not intended to use in other place
-    async fn inner_dma_calc_16bit(
-        &mut self,
-        write_dma: impl Peripheral<P = impl WriteDma<T>>,
-        read_dma: impl Peripheral<P = impl ReadDma<T>>,
-        input_buf: &[u32],              // input_buf, its content should be exact length for calculation
-        output: &mut [f32],             // caller should uses this buf as a final output array
-        output_start_index: &mut usize, // the index of start point of the output for this round of calculation
-    ) {
-        // output_buf is the place to store raw value from CORDIC (via DMA).
-        let mut output_buf = [0u32; INPUT_BUF_MAX_LEN];
-
-        let active_output_buf = &mut output_buf[..input_buf.len()];
-
-        self.launch_a_dma_transfer(write_dma, read_dma, input_buf, active_output_buf)
-            .await;
-
-        for &mut output_u32 in active_output_buf {
-            let (res1, res2) = utils::u32_to_f32_res(output_u32);
-
-            output[*output_start_index] = res1;
-            *output_start_index += 1;
-
-            if !self.config.res1_only {
-                output[*output_start_index] = res2;
-                *output_start_index += 1;
-            }
-        }
+        Ok(res_cnt)
     }
 }
 
-// check input value ARG1, ARG2, SCALE and FUNCTION are compatible with each other
-macro_rules! check_input_value {
-    ($func_name:ident, $float_type:ty) => {
+macro_rules! check_arg_value {
+    ($func_arg1_name:ident, $func_arg2_name:ident, $float_type:ty) => {
         impl<'d, T: Instance> Cordic<'d, T> {
-            fn $func_name(&self, arg1s: &[$float_type], arg2s: Option<&[$float_type]>) -> Result<(), ArgError> {
+            /// Check that the input value ARG1 is compatible with the configured SCALE and FUNCTION
+            pub fn $func_arg1_name(&self, arg: $float_type) -> Result<(), ArgError> {
                 let config = &self.config;
 
                 use Function::*;
 
                 struct Arg1ErrInfo {
                     scale: Option<Scale>,
-                    range: [f32; 2],
+                    range: [f32; 2], // f32 is ok, it only used in error display
                     inclusive_upper_bound: bool,
                 }
 
-                // check ARG1 value
                 let err_info = match config.function {
-                    Cos | Sin | Phase | Modulus | Arctan if arg1s.iter().any(|v| !(-1.0..=1.0).contains(v)) => {
-                        Some(Arg1ErrInfo {
-                            scale: None,
-                            range: [-1.0, 1.0],
-                            inclusive_upper_bound: true,
-                        })
-                    }
+                    Cos | Sin | Phase | Modulus | Arctan if !(-1.0..=1.0).contains(arg) => Some(Arg1ErrInfo {
+                        scale: None,
+                        range: [-1.0, 1.0],
+                        inclusive_upper_bound: true,
+                    }),
 
-                    Cosh | Sinh if arg1s.iter().any(|v| !(-0.559..=0.559).contains(v)) => Some(Arg1ErrInfo {
+                    Cosh | Sinh if !(-0.559..=0.559).contains(arg) => Some(Arg1ErrInfo {
                         scale: None,
                         range: [-0.559, 0.559],
                         inclusive_upper_bound: true,
                     }),
 
-                    Arctanh if arg1s.iter().any(|v| !(-0.403..=0.403).contains(v)) => Some(Arg1ErrInfo {
+                    Arctanh if !(-0.403..=0.403).contains(arg) => Some(Arg1ErrInfo {
                         scale: None,
                         range: [-0.403, 0.403],
                         inclusive_upper_bound: true,
                     }),
 
                     Ln => match config.scale {
-                        Scale::Arg1o2Res2 if arg1s.iter().any(|v| !(0.0535..0.5).contains(v)) => Some(Arg1ErrInfo {
+                        Scale::Arg1o2Res2 if !(0.0535..0.5).contains(arg) => Some(Arg1ErrInfo {
                             scale: Some(Scale::Arg1o2Res2),
                             range: [0.0535, 0.5],
                             inclusive_upper_bound: false,
                         }),
-                        Scale::Arg1o4Res4 if arg1s.iter().any(|v| !(0.25..0.75).contains(v)) => Some(Arg1ErrInfo {
+                        Scale::Arg1o4Res4 if !(0.25..0.75).contains(arg) => Some(Arg1ErrInfo {
                             scale: Some(Scale::Arg1o4Res4),
                             range: [0.25, 0.75],
                             inclusive_upper_bound: false,
                         }),
-                        Scale::Arg1o8Res8 if arg1s.iter().any(|v| !(0.375..0.875).contains(v)) => Some(Arg1ErrInfo {
+                        Scale::Arg1o8Res8 if !(0.375..0.875).contains(arg) => Some(Arg1ErrInfo {
                             scale: Some(Scale::Arg1o8Res8),
                             range: [0.375, 0.875],
                             inclusive_upper_bound: false,
                         }),
-                        Scale::Arg1o16Res16 if arg1s.iter().any(|v| !(0.4375..0.584).contains(v)) => {
-                            Some(Arg1ErrInfo {
-                                scale: Some(Scale::Arg1o16Res16),
-                                range: [0.4375, 0.584],
-                                inclusive_upper_bound: false,
-                            })
-                        }
+                        Scale::Arg1o16Res16 if !(0.4375..0.584).contains(arg) => Some(Arg1ErrInfo {
+                            scale: Some(Scale::Arg1o16Res16),
+                            range: [0.4375, 0.584],
+                            inclusive_upper_bound: false,
+                        }),
 
                         Scale::Arg1o2Res2 | Scale::Arg1o4Res4 | Scale::Arg1o8Res8 | Scale::Arg1o16Res16 => None,
 
@@ -717,17 +532,17 @@ macro_rules! check_input_value {
                     },
 
                     Sqrt => match config.scale {
-                        Scale::Arg1Res1 if arg1s.iter().any(|v| !(0.027..0.75).contains(v)) => Some(Arg1ErrInfo {
+                        Scale::Arg1Res1 if !(0.027..0.75).contains(arg) => Some(Arg1ErrInfo {
                             scale: Some(Scale::Arg1Res1),
                             range: [0.027, 0.75],
                             inclusive_upper_bound: false,
                         }),
-                        Scale::Arg1o2Res2 if arg1s.iter().any(|v| !(0.375..0.875).contains(v)) => Some(Arg1ErrInfo {
+                        Scale::Arg1o2Res2 if !(0.375..0.875).contains(arg) => Some(Arg1ErrInfo {
                             scale: Some(Scale::Arg1o2Res2),
                             range: [0.375, 0.875],
                             inclusive_upper_bound: false,
                         }),
-                        Scale::Arg1o4Res4 if arg1s.iter().any(|v| !(0.4375..0.584).contains(v)) => Some(Arg1ErrInfo {
+                        Scale::Arg1o4Res4 if !(0.4375..0.584).contains(arg) => Some(Arg1ErrInfo {
                             scale: Some(Scale::Arg1o4Res4),
                             range: [0.4375, 0.584],
                             inclusive_upper_bound: false,
@@ -749,33 +564,35 @@ macro_rules! check_input_value {
                     });
                 }
 
-                // check ARG2 value
-                if let Some(arg2s) = arg2s {
-                    struct Arg2ErrInfo {
-                        range: [f32; 2],
-                    }
+                Ok(())
+            }
 
-                    let err_info = match config.function {
-                        Cos | Sin if arg2s.iter().any(|v| !(0.0..=1.0).contains(v)) => {
-                            Some(Arg2ErrInfo { range: [0.0, 1.0] })
-                        }
+            /// Check that the input value ARG2 is compatible with the configured FUNCTION
+            pub fn $func_arg2_name(&self, arg: $float_type) -> Result<(), ArgError> {
+                let config = &self.config;
 
-                        Phase | Modulus if arg2s.iter().any(|v| !(-1.0..=1.0).contains(v)) => {
-                            Some(Arg2ErrInfo { range: [-1.0, 1.0] })
-                        }
+                use Function::*;
 
-                        Cos | Sin | Phase | Modulus | Arctan | Cosh | Sinh | Arctanh | Ln | Sqrt => None,
-                    };
+                struct Arg2ErrInfo {
+                    range: [f32; 2], // f32 is ok, it only used in error display
+                }
 
-                    if let Some(err) = err_info {
-                        return Err(ArgError {
-                            func: config.function,
-                            scale: None,
-                            arg_range: err.range,
-                            inclusive_upper_bound: true,
-                            arg_type: ArgType::Arg2,
-                        });
-                    }
+                let err_info = match config.function {
+                    Cos | Sin if !(0.0..=1.0).contains(arg) => Some(Arg2ErrInfo { range: [0.0, 1.0] }),
+
+                    Phase | Modulus if !(-1.0..=1.0).contains(arg) => Some(Arg2ErrInfo { range: [-1.0, 1.0] }),
+
+                    Cos | Sin | Phase | Modulus | Arctan | Cosh | Sinh | Arctanh | Ln | Sqrt => None,
+                };
+
+                if let Some(err) = err_info {
+                    return Err(ArgError {
+                        func: config.function,
+                        scale: None,
+                        arg_range: err.range,
+                        inclusive_upper_bound: true,
+                        arg_type: ArgType::Arg2,
+                    });
                 }
 
                 Ok(())
@@ -784,8 +601,8 @@ macro_rules! check_input_value {
     };
 }
 
-check_input_value!(check_input_f64, f64);
-check_input_value!(check_input_f32, f32);
+check_arg_value!(check_f64_arg1, check_f64_arg2, &f64);
+check_arg_value!(check_f32_arg1, check_f32_arg2, &f32);
 
 foreach_interrupt!(
     ($inst:ident, cordic, $block:ident, GLOBAL, $irq:ident) => {
diff --git a/embassy-stm32/src/cordic/utils.rs b/embassy-stm32/src/cordic/utils.rs
index 41821d6e2..008f50270 100644
--- a/embassy-stm32/src/cordic/utils.rs
+++ b/embassy-stm32/src/cordic/utils.rs
@@ -1,4 +1,4 @@
-//! Common match utils
+//! Common math utils
 use super::errors::NumberOutOfRange;
 
 macro_rules! floating_fixed_convert {
@@ -60,16 +60,3 @@ floating_fixed_convert!(
     15,
     0x3800_0000u32 // binary form of 1f32^(-15)
 );
-
-#[inline(always)]
-pub(crate) fn f32_args_to_u32(arg1: f32, arg2: f32) -> Result<u32, NumberOutOfRange> {
-    Ok(f32_to_q1_15(arg1)? as u32 + ((f32_to_q1_15(arg2)? as u32) << 16))
-}
-
-#[inline(always)]
-pub(crate) fn u32_to_f32_res(reg_value: u32) -> (f32, f32) {
-    let res1 = q1_15_to_f32((reg_value & ((1u32 << 16) - 1)) as u16);
-    let res2 = q1_15_to_f32((reg_value >> 16) as u16);
-
-    (res1, res2)
-}
diff --git a/examples/stm32h5/src/bin/cordic.rs b/examples/stm32h5/src/bin/cordic.rs
index d49f75b8f..73e873574 100644
--- a/examples/stm32h5/src/bin/cordic.rs
+++ b/examples/stm32h5/src/bin/cordic.rs
@@ -3,7 +3,7 @@
 
 use defmt::*;
 use embassy_executor::Spawner;
-use embassy_stm32::cordic;
+use embassy_stm32::cordic::{self, utils};
 use {defmt_rtt as _, panic_probe as _};
 
 #[embassy_executor::main]
@@ -16,20 +16,63 @@ async fn main(_spawner: Spawner) {
             cordic::Function::Sin,
             Default::default(),
             Default::default(),
-            false,
         )),
     );
 
-    let mut output = [0f64; 16];
+    // for the output buffer, the length is not strict; anything larger than the minimum required is ok.
+    let mut output_f64 = [0f64; 19];
+    let mut output_u32 = [0u32; 21];
 
-    let arg1 = [1.0, 0.0, -1.0]; // for trigonometric function, the ARG1 value [-pi, pi] should be map to [-1, 1]
-    let arg2 = [0.5, 1.0];
+    // Tip:
+    // The CORDIC peripheral has strict requirements on input values; you can also use the ".check_fXX_argX()" methods
+    // (e.g. ".check_f64_arg1()") to make sure your input values are compatible with the current CORDIC setup.
+    let arg1 = [-1.0, -0.5, 0.0, 0.5, 1.0]; // for trigonometric function, the ARG1 value [-pi, pi] should be map to [-1, 1]
+    let arg2 = [0.5]; // and for Sin function, ARG2 should be in [0, 1]
 
-    let cnt = unwrap!(
+    let mut input_buf = [0u32; 9];
+
+    // convert input from floating point to fixed point
+    input_buf[0] = unwrap!(utils::f64_to_q1_31(arg1[0]));
+    input_buf[1] = unwrap!(utils::f64_to_q1_31(arg2[0]));
+
+    // If input length is small, blocking mode can be used to minimize overhead.
+    let cnt0 = unwrap!(cordic.blocking_calc_32bit(
+        &input_buf[..2], // input length is strict, since driver use its length to detect calculation count
+        &mut output_u32,
+        false,
+        false
+    ));
+
+    // convert result from fixed point into floating point
+    for (&u32_val, f64_val) in output_u32[..cnt0].iter().zip(output_f64.iter_mut()) {
+        *f64_val = utils::q1_31_to_f64(u32_val);
+    }
+
+    // convert input from floating point to fixed point
+    //
+    // the first value from arg1 has already been used, so start from arg1[1..]
+    for (&f64_val, u32_val) in arg1[1..].iter().zip(input_buf.iter_mut()) {
+        *u32_val = unwrap!(utils::f64_to_q1_31(f64_val));
+    }
+
+    // If calculation is a little longer, async mode can make use of DMA, and let core do some other stuff.
+    let cnt1 = unwrap!(
         cordic
-            .async_calc_32bit(&mut dp.GPDMA1_CH0, &mut dp.GPDMA1_CH1, &arg1, Some(&arg2), &mut output,)
+            .async_calc_32bit(
+                &mut dp.GPDMA1_CH0,
+                &mut dp.GPDMA1_CH1,
+                &input_buf[..arg1.len() - 1], // limit input buf to its actual length
+                &mut output_u32,
+                true,
+                false
+            )
             .await
     );
 
-    println!("async calc 32bit: {}", output[..cnt]);
+    // convert result from fixed point into floating point
+    for (&u32_val, f64_val) in output_u32[..cnt1].iter().zip(output_f64[cnt0..cnt0 + cnt1].iter_mut()) {
+        *f64_val = utils::q1_31_to_f64(u32_val);
+    }
+
+    println!("result: {}", output_f64[..cnt0 + cnt1]);
 }
diff --git a/tests/stm32/src/bin/cordic.rs b/tests/stm32/src/bin/cordic.rs
index cd2e9d6f7..669fd96ab 100644
--- a/tests/stm32/src/bin/cordic.rs
+++ b/tests/stm32/src/bin/cordic.rs
@@ -14,6 +14,7 @@
 mod common;
 use common::*;
 use embassy_executor::Spawner;
+use embassy_stm32::cordic::utils;
 use embassy_stm32::{bind_interrupts, cordic, peripherals, rng};
 use num_traits::Float;
 use {defmt_rtt as _, panic_probe as _};
@@ -24,11 +25,12 @@ bind_interrupts!(struct Irqs {
 
 /* input value control, can be changed */
 
-const ARG1_LENGTH: usize = 9;
-const ARG2_LENGTH: usize = 4; // this might not be the exact length of ARG2, since ARG2 need to be inside [0, 1]
+const INPUT_U32_COUNT: usize = 9;
+const INPUT_U8_COUNT: usize = 4 * INPUT_U32_COUNT;
 
-const INPUT_Q1_31_LENGTH: usize = ARG1_LENGTH + ARG2_LENGTH;
-const INPUT_U8_LENGTH: usize = 4 * INPUT_Q1_31_LENGTH;
+// Assume the first calculation needs 2 arguments, and the rest need 1 argument each.
+// And all calculations generate 2 results.
+const OUTPUT_LENGTH: usize = (INPUT_U32_COUNT - 1) * 2;
 
 #[embassy_executor::main]
 async fn main(_spawner: Spawner) {
@@ -42,43 +44,28 @@ async fn main(_spawner: Spawner) {
 
     let mut rng = rng::Rng::new(dp.RNG, Irqs);
 
-    let mut input_buf_u8 = [0u8; INPUT_U8_LENGTH];
+    let mut input_buf_u8 = [0u8; INPUT_U8_COUNT];
     defmt::unwrap!(rng.async_fill_bytes(&mut input_buf_u8).await);
 
     // convert every [u8; 4] to a u32, for a Q1.31 value
-    let input_q1_31 = unsafe { core::mem::transmute::<[u8; INPUT_U8_LENGTH], [u32; INPUT_Q1_31_LENGTH]>(input_buf_u8) };
+    let mut input_q1_31 = unsafe { core::mem::transmute::<[u8; INPUT_U8_COUNT], [u32; INPUT_U32_COUNT]>(input_buf_u8) };
 
-    let mut input_f64_buf = [0f64; INPUT_Q1_31_LENGTH];
+    // ARG2 for the Sin function should be inside [0, 1]; setting the MSB of a Q1.31 value to 0 makes sure it is no less than 0.
+    input_q1_31[1] &= !(1u32 << 31);
 
-    let mut cordic_output_f64_buf = [0f64; ARG1_LENGTH * 2];
+    //
+    // CORDIC calculation
+    //
 
-    // convert Q1.31 value back to f64, for software calculation verify
-    for (val_u32, val_f64) in input_q1_31.iter().zip(input_f64_buf.iter_mut()) {
-        *val_f64 = cordic::utils::q1_31_to_f64(*val_u32);
-    }
-
-    let mut arg2_f64_buf = [0f64; ARG2_LENGTH];
-    let mut arg2_f64_len = 0;
-
-    // check if ARG2 is in range [0, 1] (limited by CORDIC peripheral with Sin mode)
-    for &arg2 in &input_f64_buf[ARG1_LENGTH..] {
-        if arg2 >= 0.0 {
-            arg2_f64_buf[arg2_f64_len] = arg2;
-            arg2_f64_len += 1;
-        }
-    }
-
-    // the actual value feed to CORDIC
-    let arg1_f64_ls = &input_f64_buf[..ARG1_LENGTH];
-    let arg2_f64_ls = &arg2_f64_buf[..arg2_f64_len];
+    let mut output_q1_31 = [0u32; OUTPUT_LENGTH];
 
+    // setup Cordic driver
     let mut cordic = cordic::Cordic::new(
         dp.CORDIC,
         defmt::unwrap!(cordic::Config::new(
             cordic::Function::Sin,
             Default::default(),
             Default::default(),
-            false,
         )),
     );
 
@@ -88,67 +75,66 @@ async fn main(_spawner: Spawner) {
     #[cfg(any(feature = "stm32h563zi", feature = "stm32u585ai", feature = "stm32u5a5zj"))]
     let (mut write_dma, mut read_dma) = (dp.GPDMA1_CH4, dp.GPDMA1_CH5);
 
-    let cordic_start_point = embassy_time::Instant::now();
+    // calculate first result using blocking mode
+    let cnt0 = defmt::unwrap!(cordic.blocking_calc_32bit(&input_q1_31[..2], &mut output_q1_31, false, false));
 
-    let cnt = unwrap!(
+    // calculate rest results using async mode
+    let cnt1 = defmt::unwrap!(
         cordic
             .async_calc_32bit(
                 &mut write_dma,
                 &mut read_dma,
-                arg1_f64_ls,
-                Some(arg2_f64_ls),
-                &mut cordic_output_f64_buf,
+                &input_q1_31[2..],
+                &mut output_q1_31[cnt0..],
+                true,
+                false,
             )
             .await
     );
 
-    let cordic_end_point = embassy_time::Instant::now();
+    // the total number of output values should be the same as our output buffer size
+    defmt::assert_eq!(cnt0 + cnt1, output_q1_31.len());
 
-    // since we get 2 output for 1 calculation, the output length should be ARG1_LENGTH * 2
-    defmt::assert!(cnt == ARG1_LENGTH * 2);
+    let mut cordic_result_f64 = [0.0f64; OUTPUT_LENGTH];
 
-    let mut software_output_f64_buf = [0f64; ARG1_LENGTH * 2];
+    for (f64_val, u32_val) in cordic_result_f64.iter_mut().zip(output_q1_31) {
+        *f64_val = utils::q1_31_to_f64(u32_val);
+    }
 
-    // for software calc, if there is no ARG2 value, insert a 1.0 as value (the reset value for ARG2 in CORDIC)
-    let arg2_f64_ls = if arg2_f64_len == 0 { &[1.0] } else { arg2_f64_ls };
+    //
+    // software calculation
+    //
 
-    let software_inputs = arg1_f64_ls
+    let mut software_result_f64 = [0.0f64; OUTPUT_LENGTH];
+
+    let arg2 = utils::q1_31_to_f64(input_q1_31[1]);
+
+    for (&arg1, res) in input_q1_31
         .iter()
-        .zip(
-            arg2_f64_ls
-                .iter()
-                .chain(core::iter::repeat(&arg2_f64_ls[arg2_f64_ls.len() - 1])),
-        )
-        .zip(software_output_f64_buf.chunks_mut(2));
+        .enumerate()
+        .filter_map(|(idx, val)| if idx != 1 { Some(val) } else { None })
+        .zip(software_result_f64.chunks_mut(2))
+    {
+        let arg1 = utils::q1_31_to_f64(arg1);
 
-    let software_start_point = embassy_time::Instant::now();
-
-    for ((arg1, arg2), res) in software_inputs {
         let (raw_res1, raw_res2) = (arg1 * core::f64::consts::PI).sin_cos();
-
         (res[0], res[1]) = (raw_res1 * arg2, raw_res2 * arg2);
     }
 
-    let software_end_point = embassy_time::Instant::now();
+    //
+    // check that the results are the same
+    //
 
-    for (cordic_res, software_res) in cordic_output_f64_buf[..cnt]
+    for (cordic_res, software_res) in cordic_result_f64[..cnt0 + cnt1]
         .chunks(2)
-        .zip(software_output_f64_buf.chunks(2))
+        .zip(software_result_f64.chunks(2))
     {
         for (cord_res, soft_res) in cordic_res.iter().zip(software_res.iter()) {
+            // 2.0.powi(-19) is the max residual error for Sin function, in q1.31 format, with 24 iterations (aka PRECISION = 6)
             defmt::assert!((cord_res - soft_res).abs() <= 2.0.powi(-19));
         }
     }
 
-    // This comparison is just for fun. Since it not a equal compare:
-    // software use 64-bit floating point, but CORDIC use 32-bit fixed point.
-    defmt::trace!(
-        "calculate count: {}, Cordic time: {} us, software time: {} us",
-        ARG1_LENGTH,
-        (cordic_end_point - cordic_start_point).as_micros(),
-        (software_end_point - software_start_point).as_micros()
-    );
-
     info!("Test OK");
     cortex_m::asm::bkpt();
 }

From 8fa1d06a6a7bf3d00ac87319ac71953237535c43 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Sat, 23 Mar 2024 09:04:09 +0800
Subject: [PATCH 15/17] stm32 CORDIC: use private_bounds for sealed traits.

---
 embassy-stm32/src/cordic/mod.rs    | 16 ++++++----------
 embassy-stm32/src/cordic/sealed.rs |  2 +-
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs
index 2479e1b27..6bbc48f2b 100644
--- a/embassy-stm32/src/cordic/mod.rs
+++ b/embassy-stm32/src/cordic/mod.rs
@@ -11,16 +11,11 @@ pub use enums::*;
 mod errors;
 pub use errors::*;
 
+mod sealed;
+use self::sealed::SealedInstance;
+
 pub mod utils;
 
-pub(crate) mod sealed;
-
-/// Low-level CORDIC access.
-#[cfg(feature = "unstable-pac")]
-pub mod low_level {
-    pub use super::sealed::*;
-}
-
 /// CORDIC driver
 pub struct Cordic<'d, T: Instance> {
     peri: PeripheralRef<'d, T>,
@@ -28,7 +23,8 @@ pub struct Cordic<'d, T: Instance> {
 }
 
 /// CORDIC instance trait
-pub trait Instance: sealed::Instance + Peripheral<P = Self> + crate::rcc::RccPeripheral {}
+#[allow(private_bounds)]
+pub trait Instance: SealedInstance + Peripheral<P = Self> + crate::rcc::RccPeripheral {}
 
 /// CORDIC configuration
 #[derive(Debug)]
@@ -609,7 +605,7 @@ foreach_interrupt!(
         impl Instance for peripherals::$inst {
         }
 
-        impl sealed::Instance for peripherals::$inst {
+        impl SealedInstance for peripherals::$inst {
             fn regs() -> crate::pac::cordic::Cordic {
                 crate::pac::$inst
             }
diff --git a/embassy-stm32/src/cordic/sealed.rs b/embassy-stm32/src/cordic/sealed.rs
index f9521ff7a..8f0bd1830 100644
--- a/embassy-stm32/src/cordic/sealed.rs
+++ b/embassy-stm32/src/cordic/sealed.rs
@@ -2,7 +2,7 @@ use super::*;
 use crate::pac::cordic::vals;
 
 /// Cordic instance
-pub trait Instance {
+pub(super) trait SealedInstance {
     /// Get access to CORDIC registers
     fn regs() -> crate::pac::cordic::Cordic;
 

From 79eabc95aa3b627dcfec92491979b433e25a09ba Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Sat, 23 Mar 2024 09:53:19 +0800
Subject: [PATCH 16/17] stm32 CORDIC: add g491re back to cordic test

---
 tests/stm32/Cargo.toml        |  2 +-
 tests/stm32/src/bin/cordic.rs | 11 +++--------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/tests/stm32/Cargo.toml b/tests/stm32/Cargo.toml
index 345c72a03..b19af98a0 100644
--- a/tests/stm32/Cargo.toml
+++ b/tests/stm32/Cargo.toml
@@ -14,7 +14,7 @@ stm32f429zi = ["embassy-stm32/stm32f429zi", "chrono", "eth", "stop", "can", "not
 stm32f446re = ["embassy-stm32/stm32f446re", "chrono", "stop", "can", "not-gpdma", "dac", "sdmmc"]
 stm32f767zi = ["embassy-stm32/stm32f767zi", "chrono", "not-gpdma", "eth", "rng"]
 stm32g071rb = ["embassy-stm32/stm32g071rb", "cm0", "not-gpdma", "dac", "ucpd"]
-stm32g491re = ["embassy-stm32/stm32g491re", "chrono", "stop", "not-gpdma", "rng", "fdcan"]
+stm32g491re = ["embassy-stm32/stm32g491re", "chrono", "stop", "not-gpdma", "rng", "fdcan", "cordic"]
 stm32h563zi = ["embassy-stm32/stm32h563zi", "chrono", "eth", "rng", "hash", "cordic"]
 stm32h753zi = ["embassy-stm32/stm32h753zi", "chrono", "not-gpdma", "eth", "rng", "fdcan", "hash", "cryp"]
 stm32h755zi = ["embassy-stm32/stm32h755zi-cm7", "chrono", "not-gpdma", "eth", "dac", "rng", "fdcan", "hash", "cryp"]
diff --git a/tests/stm32/src/bin/cordic.rs b/tests/stm32/src/bin/cordic.rs
index 669fd96ab..400e10207 100644
--- a/tests/stm32/src/bin/cordic.rs
+++ b/tests/stm32/src/bin/cordic.rs
@@ -2,11 +2,6 @@
 
 // Test Cordic driver, with Q1.31 format, Sin function, at 24 iterations (aka PRECISION = 6), using DMA transfer
 
-// Only test on STM32H563ZI, STM32U585AI and STM32U5a5JI.
-// STM32G491RE is not tested, since it memory.x has less memory size than it actually has,
-// and the test seems use more memory than memory.x suggest.
-// see https://github.com/embassy-rs/stm32-data/issues/301#issuecomment-1925412561
-
 #![no_std]
 #![no_main]
 
@@ -69,11 +64,11 @@ async fn main(_spawner: Spawner) {
         )),
     );
 
-    //#[cfg(feature = "stm32g491re")]
-    //let (mut write_dma, mut read_dma) = (dp.DMA1_CH4, dp.DMA1_CH5);
+    #[cfg(feature = "stm32g491re")]
+    let (mut write_dma, mut read_dma) = (dp.DMA1_CH4, dp.DMA1_CH5);
 
     #[cfg(any(feature = "stm32h563zi", feature = "stm32u585ai", feature = "stm32u5a5zj"))]
-    let (mut write_dma, mut read_dma) = (dp.GPDMA1_CH4, dp.GPDMA1_CH5);
+    let (mut write_dma, mut read_dma) = (dp.GPDMA1_CH0, dp.GPDMA1_CH1);
 
     // calculate first result using blocking mode
     let cnt0 = defmt::unwrap!(cordic.blocking_calc_32bit(&input_q1_31[..2], &mut output_q1_31, false, false));

From 6b2e15e318c05a66f17575cde4987353a52108a4 Mon Sep 17 00:00:00 2001
From: eZio Pan <eziopan@qq.com>
Date: Tue, 26 Mar 2024 14:43:09 +0800
Subject: [PATCH 17/17] stm32 CORDIC: exclude stm32u5a

---
 embassy-stm32/build.rs   | 9 +++++++--
 embassy-stm32/src/lib.rs | 5 +++--
 tests/stm32/Cargo.toml   | 2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/embassy-stm32/build.rs b/embassy-stm32/build.rs
index e224cc5a2..057c4cee2 100644
--- a/embassy-stm32/build.rs
+++ b/embassy-stm32/build.rs
@@ -1139,13 +1139,18 @@ fn main() {
         (("timer", "CH2"), quote!(crate::timer::Ch2Dma)),
         (("timer", "CH3"), quote!(crate::timer::Ch3Dma)),
         (("timer", "CH4"), quote!(crate::timer::Ch4Dma)),
-        (("cordic", "WRITE"), quote!(crate::cordic::WriteDma)),
-        (("cordic", "READ"), quote!(crate::cordic::ReadDma)),
+        (("cordic", "WRITE"), quote!(crate::cordic::WriteDma)), // FIXME: stm32u5a crash on Cordic driver
+        (("cordic", "READ"), quote!(crate::cordic::ReadDma)),   // FIXME: stm32u5a crash on Cordic driver
     ]
     .into();
 
     for p in METADATA.peripherals {
         if let Some(regs) = &p.registers {
+            // FIXME: stm32u5a crash on Cordic driver
+            if chip_name.starts_with("stm32u5a") && regs.kind == "cordic" {
+                continue;
+            }
+
             let mut dupe = HashSet::new();
             for ch in p.dma_channels {
                 // Some chips have multiple request numbers for the same (peri, signal, channel) combos.
diff --git a/embassy-stm32/src/lib.rs b/embassy-stm32/src/lib.rs
index ae2e95435..dd4aef51e 100644
--- a/embassy-stm32/src/lib.rs
+++ b/embassy-stm32/src/lib.rs
@@ -32,7 +32,8 @@ pub mod timer;
 pub mod adc;
 #[cfg(can)]
 pub mod can;
-#[cfg(cordic)]
+// FIXME: Cordic driver causes stm32u5a5zj to crash
+#[cfg(all(cordic, not(any(stm32u5a5, stm32u5a9))))]
 pub mod cordic;
 #[cfg(crc)]
 pub mod crc;
@@ -236,7 +237,7 @@ pub fn init(config: Config) -> Peripherals {
 
         #[cfg(dbgmcu)]
         crate::pac::DBGMCU.cr().modify(|cr| {
-            #[cfg(any(dbgmcu_h5))]
+            #[cfg(dbgmcu_h5)]
             {
                 cr.set_stop(config.enable_debug_during_sleep);
                 cr.set_standby(config.enable_debug_during_sleep);
diff --git a/tests/stm32/Cargo.toml b/tests/stm32/Cargo.toml
index b19af98a0..e09083111 100644
--- a/tests/stm32/Cargo.toml
+++ b/tests/stm32/Cargo.toml
@@ -26,7 +26,7 @@ stm32l4a6zg = ["embassy-stm32/stm32l4a6zg", "chrono", "not-gpdma", "rng", "hash"
 stm32l4r5zi = ["embassy-stm32/stm32l4r5zi", "chrono", "not-gpdma", "rng"]
 stm32l552ze = ["embassy-stm32/stm32l552ze", "not-gpdma", "rng", "hash"]
 stm32u585ai = ["embassy-stm32/stm32u585ai", "chrono", "rng", "hash", "cordic"]
-stm32u5a5zj = ["embassy-stm32/stm32u5a5zj", "chrono", "rng", "hash", "cordic"]
+stm32u5a5zj = ["embassy-stm32/stm32u5a5zj", "chrono", "rng", "hash"] # FIXME: cordic test cause it crash
 stm32wb55rg = ["embassy-stm32/stm32wb55rg", "chrono", "not-gpdma", "ble", "mac" , "rng"]
 stm32wba52cg = ["embassy-stm32/stm32wba52cg", "chrono", "rng", "hash"]
 stm32wl55jc = ["embassy-stm32/stm32wl55jc-cm4", "not-gpdma", "rng", "chrono"]