diff --git a/embassy-stm32/build.rs b/embassy-stm32/build.rs
index 058b8a0fc..de03827e9 100644
--- a/embassy-stm32/build.rs
+++ b/embassy-stm32/build.rs
@@ -1008,6 +1008,7 @@ fn main() {
         (("quadspi", "QUADSPI"), quote!(crate::qspi::QuadDma)),
         (("dac", "CH1"), quote!(crate::dac::DacDma1)),
         (("dac", "CH2"), quote!(crate::dac::DacDma2)),
+        (("timer", "UP"), quote!(crate::timer::UpDma)),
     ]
     .into();
 
@@ -1023,6 +1024,16 @@ fn main() {
                 }
 
                 if let Some(tr) = signals.get(&(regs.kind, ch.signal)) {
+                    // TIM6 of stm32f334 is special: the DMA channel for TIM6 depends on SYSCFG state, so skip it
+                    if chip_name.starts_with("stm32f334") && p.name == "TIM6" {
+                        continue;
+                    }
+
+                    // TIM6 of stm32f378 is special: the DMA channel for TIM6 depends on SYSCFG state, so skip it
+                    if chip_name.starts_with("stm32f378") && p.name == "TIM6" {
+                        continue;
+                    }
+
                     let peri = format_ident!("{}", p.name);
 
                     let channel = if let Some(channel) = &ch.channel {
diff --git a/embassy-stm32/src/timer/mod.rs b/embassy-stm32/src/timer/mod.rs
index 74120adad..d07fd2776 100644
--- a/embassy-stm32/src/timer/mod.rs
+++ b/embassy-stm32/src/timer/mod.rs
@@ -91,7 +91,17 @@ pub(crate) mod sealed {
 
         /// Enable/disable the update interrupt.
         fn enable_update_interrupt(&mut self, enable: bool) {
-            Self::regs().dier().write(|r| r.set_uie(enable));
+            Self::regs().dier().modify(|r| r.set_uie(enable));
+        }
+
+        /// Enable/disable the update DMA (the `ude` field of `DIER`) via a read-modify-write of the register.
+        fn enable_update_dma(&mut self, enable: bool) {
+            Self::regs().dier().modify(|r| r.set_ude(enable));
+        }
+
+        /// Get the update DMA enable/disable state (reads the `ude` field of `DIER`).
+        fn get_update_dma_state(&self) -> bool {
+            Self::regs().dier().read().ude()
         }
 
         /// Enable/disable autoreload preload.
@@ -269,6 +279,11 @@ pub(crate) mod sealed {
             Self::regs_gp16().ccer().modify(|w| w.set_cce(channel.index(), enable));
         }
 
+        /// Get the enable/disable state of a channel (reads `CCER` through the `cce` accessor at the channel's index).
+        fn get_channel_enable_state(&self, channel: Channel) -> bool {
+            Self::regs_gp16().ccer().read().cce(channel.index())
+        }
+
         /// Set compare value for a channel.
         fn set_compare_value(&mut self, channel: Channel, value: u16) {
             Self::regs_gp16().ccr(channel.index()).modify(|w| w.set_ccr(value));
@@ -288,6 +303,14 @@ pub(crate) mod sealed {
         fn get_compare_value(&self, channel: Channel) -> u16 {
             Self::regs_gp16().ccr(channel.index()).read().ccr()
         }
+
+        /// Enable/disable output compare preload: `index / 2` picks the CCMR register, `index % 2` its slot.
+        fn set_output_compare_preload(&mut self, channel: Channel, preload: bool) {
+            let idx = channel.index();
+            let (reg, slot) = (idx / 2, idx % 2);
+            let ccmr = Self::regs_gp16().ccmr_output(reg);
+            ccmr.modify(|w| w.set_ocpe(slot, preload));
+        }
     }
 
     /// Capture/Compare 16-bit timer instance with complementary pin support.
@@ -535,13 +558,16 @@ impl From<OutputPolarity> for bool {
 pub trait Basic16bitInstance: sealed::Basic16bitInstance + 'static {}
 
 /// Gneral-purpose 16-bit timer instance.
-pub trait GeneralPurpose16bitInstance: sealed::GeneralPurpose16bitInstance + 'static {}
+pub trait GeneralPurpose16bitInstance: sealed::GeneralPurpose16bitInstance + Basic16bitInstance + 'static {}
 
 /// Gneral-purpose 32-bit timer instance.
-pub trait GeneralPurpose32bitInstance: sealed::GeneralPurpose32bitInstance + 'static {}
+pub trait GeneralPurpose32bitInstance:
+    sealed::GeneralPurpose32bitInstance + GeneralPurpose16bitInstance + 'static
+{
+}
 
 /// Advanced control timer instance.
-pub trait AdvancedControlInstance: sealed::AdvancedControlInstance + 'static {}
+pub trait AdvancedControlInstance: sealed::AdvancedControlInstance + GeneralPurpose16bitInstance + 'static {}
 
 /// Capture/Compare 16-bit timer instance.
 pub trait CaptureCompare16bitInstance:
@@ -551,7 +577,7 @@ pub trait CaptureCompare16bitInstance:
 
 /// Capture/Compare 16-bit timer instance with complementary pin support.
 pub trait ComplementaryCaptureCompare16bitInstance:
-    sealed::ComplementaryCaptureCompare16bitInstance + AdvancedControlInstance + 'static
+    sealed::ComplementaryCaptureCompare16bitInstance + CaptureCompare16bitInstance + AdvancedControlInstance + 'static
 {
 }
 
@@ -676,3 +702,6 @@ foreach_interrupt! {
         }
     };
 }
+
+// DMA triggered by the timer Update Event, for every timer
+dma_trait!(UpDma, Basic16bitInstance);
diff --git a/embassy-stm32/src/timer/simple_pwm.rs b/embassy-stm32/src/timer/simple_pwm.rs
index e6072aa15..80f10424c 100644
--- a/embassy-stm32/src/timer/simple_pwm.rs
+++ b/embassy-stm32/src/timer/simple_pwm.rs
@@ -86,14 +86,13 @@ impl<'d, T: CaptureCompare16bitInstance> SimplePwm<'d, T> {
 
         this.inner.enable_outputs();
 
-        this.inner
-            .set_output_compare_mode(Channel::Ch1, OutputCompareMode::PwmMode1);
-        this.inner
-            .set_output_compare_mode(Channel::Ch2, OutputCompareMode::PwmMode1);
-        this.inner
-            .set_output_compare_mode(Channel::Ch3, OutputCompareMode::PwmMode1);
-        this.inner
-            .set_output_compare_mode(Channel::Ch4, OutputCompareMode::PwmMode1);
+        [Channel::Ch1, Channel::Ch2, Channel::Ch3, Channel::Ch4]
+            .iter()
+            .for_each(|&channel| {
+                this.inner.set_output_compare_mode(channel, OutputCompareMode::PwmMode1);
+                this.inner.set_output_compare_preload(channel, true)
+            });
+
         this
     }
 
@@ -107,6 +106,11 @@ impl<'d, T: CaptureCompare16bitInstance> SimplePwm<'d, T> {
         self.inner.enable_channel(channel, false);
     }
 
+    /// Check whether the given channel is enabled.
+    pub fn is_enabled(&self, channel: Channel) -> bool {
+        self.inner.get_channel_enable_state(channel)
+    }
+
     /// Set PWM frequency.
     ///
     /// Note: when you call this, the max duty value changes, so you will have to
@@ -135,10 +139,86 @@ impl<'d, T: CaptureCompare16bitInstance> SimplePwm<'d, T> {
         self.inner.set_compare_value(channel, duty)
     }
 
+    /// Get the current duty (the channel's compare value) for a given channel.
+    ///
+    /// The value ranges from 0 for 0% duty, to [`get_max_duty`](Self::get_max_duty) for 100% duty, both included.
+    pub fn get_duty(&self, channel: Channel) -> u16 {
+        self.inner.get_compare_value(channel)
+    }
+
     /// Set the output polarity for a given channel.
     pub fn set_polarity(&mut self, channel: Channel, polarity: OutputPolarity) {
         self.inner.set_output_polarity(channel, polarity);
     }
+
+    /// Generate a sequence of PWM waveform by letting DMA write each value of `duty` into the channel's CCR.
+    ///
+    /// Note: you will need to provide the corresponding TIMx_UP DMA channel to use this method.
+    /// Panics if any value in `duty` is greater than [`get_max_duty`](Self::get_max_duty).
+    pub async fn gen_waveform(
+        &mut self,
+        dma: impl Peripheral<P = impl super::UpDma<T>>,
+        channel: Channel,
+        duty: &[u16],
+    ) {
+        let max_duty = self.get_max_duty();
+        assert!(duty.iter().all(|&v| v <= max_duty));
+        into_ref!(dma);
+
+        #[allow(clippy::let_unit_value)] // e.g. stm32f334
+        let req = dma.request();
+
+        let original_duty_state = self.get_duty(channel);
+        let original_enable_state = self.is_enabled(channel);
+        let original_update_dma_state = self.inner.get_update_dma_state();
+
+        if !original_update_dma_state {
+            self.inner.enable_update_dma(true);
+        }
+
+        if !original_enable_state {
+            self.enable(channel);
+        }
+
+        unsafe {
+            #[cfg(not(any(bdma, gpdma)))]
+            use crate::dma::{Burst, FifoThreshold};
+            use crate::dma::{Transfer, TransferOptions};
+
+            let dma_transfer_option = TransferOptions {
+                #[cfg(not(any(bdma, gpdma)))]
+                fifo_threshold: Some(FifoThreshold::Full),
+                #[cfg(not(any(bdma, gpdma)))]
+                mburst: Burst::Incr8,
+                ..Default::default()
+            };
+
+            Transfer::new_write(
+                &mut dma,
+                req,
+                duty,
+                T::regs_gp16().ccr(channel.index()).as_ptr() as *mut _,
+                dma_transfer_option,
+            )
+            .await
+        };
+
+        // restore output compare state
+        if !original_enable_state {
+            self.disable(channel);
+        }
+
+        self.set_duty(channel, original_duty_state);
+
+        // Since the DMA transfer has finished before the timer's update-event DMA request is turned off,
+        // this can almost always trigger a DMA FIFO error.
+        //
+        // optional TODO:
+        // clear FEIF after disabling UDE
+        if !original_update_dma_state {
+            self.inner.enable_update_dma(false);
+        }
+    }
 }
 
 impl<'d, T: CaptureCompare16bitInstance> embedded_hal_02::Pwm for SimplePwm<'d, T> {
diff --git a/examples/stm32f4/src/bin/ws2812_pwm_dma.rs b/examples/stm32f4/src/bin/ws2812_pwm.rs
similarity index 51%
rename from examples/stm32f4/src/bin/ws2812_pwm_dma.rs
rename to examples/stm32f4/src/bin/ws2812_pwm.rs
index 4458b643f..239709253 100644
--- a/examples/stm32f4/src/bin/ws2812_pwm_dma.rs
+++ b/examples/stm32f4/src/bin/ws2812_pwm.rs
@@ -2,15 +2,9 @@
 // We assume the DIN pin of ws2812 connect to GPIO PB4, and ws2812 is properly powered.
 //
 // The idea is that the data rate of ws2812 is 800 kHz, and it use different duty ratio to represent bit 0 and bit 1.
-// Thus we can set TIM overflow at 800 kHz, and let TIM Update Event trigger a DMA transfer, then let DMA change CCR value,
-// such that pwm duty ratio meet the bit representation of ws2812.
+// Thus we can set the TIM overflow at 800 kHz, and change the duty ratio of the TIM to match the bit representation of ws2812.
 //
-// You may want to modify TIM CCR with Cortex core directly,
-// but according to my test, Cortex core will need to run far more than 100 MHz to catch up with TIM.
-// Thus we need to use a DMA.
-//
-// This demo is a combination of HAL, PAC, and manually invoke `dma::Transfer`.
-// If you need a simpler way to control ws2812, you may want to take a look at `ws2812_spi.rs` file, which make use of SPI.
+// You may also want to take a look at the `ws2812_spi.rs` file, which makes use of SPI instead.
 //
 // Warning:
 // DO NOT stare at ws2812 directy (especially after each MCU Reset), its (max) brightness could easily make your eyes feel burn.
@@ -20,7 +14,6 @@
 
 use embassy_executor::Spawner;
 use embassy_stm32::gpio::OutputType;
-use embassy_stm32::pac;
 use embassy_stm32::time::khz;
 use embassy_stm32::timer::simple_pwm::{PwmPin, SimplePwm};
 use embassy_stm32::timer::{Channel, CountingMode};
@@ -89,62 +82,20 @@ async fn main(_spawner: Spawner) {
 
     let pwm_channel = Channel::Ch1;
 
-    // PAC level hacking, enable output compare preload
-    // keep output waveform integrity
-    pac::TIM3
-        .ccmr_output(pwm_channel.index())
-        .modify(|v| v.set_ocpe(0, true));
-
     // make sure PWM output keep low on first start
     ws2812_pwm.set_duty(pwm_channel, 0);
 
-    {
-        use embassy_stm32::dma::{Burst, FifoThreshold, Transfer, TransferOptions};
+    // flip color at 2 Hz
+    let mut ticker = Ticker::every(Duration::from_millis(500));
 
-        // configure FIFO and MBURST of DMA, to minimize DMA occupation on AHB/APB
-        let mut dma_transfer_option = TransferOptions::default();
-        dma_transfer_option.fifo_threshold = Some(FifoThreshold::Full);
-        dma_transfer_option.mburst = Burst::Incr8;
-
-        // flip color at 2 Hz
-        let mut ticker = Ticker::every(Duration::from_millis(500));
-
-        loop {
-            for &color in color_list {
-                // start PWM output
-                ws2812_pwm.enable(pwm_channel);
-
-                // PAC level hacking, enable timer-update-event trigger DMA
-                pac::TIM3.dier().modify(|v| v.set_ude(true));
-
-                unsafe {
-                    Transfer::new_write(
-                        // with &mut, we can easily reuse same DMA channel multiple times
-                        &mut dp.DMA1_CH2,
-                        5,
-                        color,
-                        pac::TIM3.ccr(pwm_channel.index()).as_ptr() as *mut _,
-                        dma_transfer_option,
-                    )
-                    .await;
-
-                    // Turn off timer-update-event trigger DMA as soon as possible.
-                    // Then clean the FIFO Error Flag if set.
-                    pac::TIM3.dier().modify(|v| v.set_ude(false));
-                    if pac::DMA1.isr(0).read().feif(2) {
-                        pac::DMA1.ifcr(0).write(|v| v.set_feif(2, true));
-                    }
-
-                    // ws2812 need at least 50 us low level input to confirm the input data and change it's state
-                    Timer::after_micros(50).await;
-                }
-
-                // stop PWM output for saving some energy
-                ws2812_pwm.disable(pwm_channel);
-
-                // wait until ticker tick
-                ticker.next().await;
-            }
+    loop {
+        for &color in color_list {
+            // with &mut, we can easily reuse the same DMA channel multiple times
+            ws2812_pwm.gen_waveform(&mut dp.DMA1_CH2, pwm_channel, color).await;
+            // ws2812 needs at least 50 us of low-level input to confirm the input data and change its state
+            Timer::after_micros(50).await;
+            // wait until the next ticker tick
+            ticker.next().await;
         }
     }
 }