diff --git a/embassy-stm32/src/dma/bdma.rs b/embassy-stm32/src/dma/bdma.rs
index 46670e1be..adb288eb0 100644
--- a/embassy-stm32/src/dma/bdma.rs
+++ b/embassy-stm32/src/dma/bdma.rs
@@ -89,7 +89,11 @@ pub(crate) unsafe fn do_transfer(
     ch.cr().write(|w| {
         w.set_psize(vals::Size::BITS8);
         w.set_msize(vals::Size::BITS8);
-        w.set_minc(vals::Inc::ENABLED);
+        if incr_mem {
+            w.set_minc(vals::Inc::ENABLED);
+        } else {
+            w.set_minc(vals::Inc::DISABLED);
+        }
         w.set_dir(dir);
         w.set_teie(true);
         w.set_tcie(true);
diff --git a/embassy-stm32/src/spi/mod.rs b/embassy-stm32/src/spi/mod.rs
index 9bb5a729c..046ec0fe7 100644
--- a/embassy-stm32/src/spi/mod.rs
+++ b/embassy-stm32/src/spi/mod.rs
@@ -1,8 +1,8 @@
 #![macro_use]
 
-#[cfg_attr(spi_v1, path = "v1.rs")]
+#[cfg_attr(spi_v1, path = "v1.rs")]
 #[cfg_attr(spi_v2, path = "v2.rs")]
-#[cfg_attr(spi_v3, path = "v3.rs")]
+#[cfg_attr(spi_v3, path = "v3.rs")]
 mod _version;
 use crate::{dma, peripherals, rcc::RccPeripheral};
 pub use _version::*;
diff --git a/embassy-stm32/src/spi/v2.rs b/embassy-stm32/src/spi/v2.rs
index 4e135e9df..400fd89af 100644
--- a/embassy-stm32/src/spi/v2.rs
+++ b/embassy-stm32/src/spi/v2.rs
@@ -1,16 +1,23 @@
 #![macro_use]
 
+use crate::dma::NoDma;
 use crate::gpio::{AnyPin, Pin};
 use crate::pac::gpio::vals::{Afr, Moder};
 use crate::pac::gpio::Gpio;
 use crate::pac::spi;
-use crate::spi::{ByteOrder, Config, Error, Instance, MisoPin, MosiPin, SckPin, WordSize};
+use crate::spi::{
+    ByteOrder, Config, Error, Instance, MisoPin, MosiPin, RxDmaChannel, SckPin, TxDmaChannel,
+    WordSize,
+};
 use crate::time::Hertz;
+use core::future::Future;
 use core::marker::PhantomData;
 use core::ptr;
 use embassy::util::Unborrow;
 use embassy_extras::unborrow;
+use embassy_traits::spi as traits;
 pub use embedded_hal::spi::{Mode, Phase, Polarity, MODE_0, MODE_1, MODE_2, MODE_3};
+use futures::future::join3;
 
 impl WordSize {
     fn ds(&self) -> spi::vals::Ds {
@@ -28,26 +35,30 @@ impl WordSize {
     }
 }
 
-pub struct Spi<'d, T: Instance> {
+pub struct Spi<'d, T: Instance, Tx, Rx> {
     sck: AnyPin,
     mosi: AnyPin,
     miso: AnyPin,
+    txdma: Tx, // TX DMA channel handle; the blocking `Write` impls require `NoDma` here
+    rxdma: Rx, // RX DMA channel handle; the blocking `Transfer` impls require `NoDma` for both
     phantom: PhantomData<&'d mut T>,
 }
 
-impl<'d, T: Instance> Spi<'d, T> {
+impl<'d, T: Instance, Tx, Rx> Spi<'d, T, Tx, Rx> {
     pub fn new<F>(
         _peri: impl Unborrow<Target = T> + 'd,
         sck: impl Unborrow<Target = impl SckPin<T>>,
         mosi: impl Unborrow<Target = impl MosiPin<T>>,
         miso: impl Unborrow<Target = impl MisoPin<T>>,
+        txdma: impl Unborrow<Target = Tx>,
+        rxdma: impl Unborrow<Target = Rx>,
         freq: F,
         config: Config,
     ) -> Self
     where
         F: Into<Hertz>,
     {
-        unborrow!(sck, mosi, miso);
+        unborrow!(sck, mosi, miso, txdma, rxdma);
 
         unsafe {
             Self::configure_pin(sck.block(), sck.pin() as _, sck.af_num());
@@ -98,6 +109,8 @@ impl<'d, T: Instance> Spi<'d, T> {
             sck,
             mosi,
             miso,
+            txdma,
+            rxdma,
             phantom: PhantomData,
         }
     }
@@ -140,9 +153,156 @@ impl<'d, T: Instance> Spi<'d, T> {
             });
         }
     }
+
+    /// Transmits `write` over SPI through the TX DMA channel, 8 bits per word.
+    /// Incoming data is not captured.
+    #[allow(unused)]
+    async fn write_dma_u8(&mut self, write: &[u8]) -> Result<(), Error>
+    where
+        Tx: TxDmaChannel<T>,
+    {
+        unsafe {
+            T::regs().cr1().modify(|w| w.set_spe(false));
+            T::regs().cr2().modify(|reg| reg.set_rxdmaen(false)); // no RX DMA channel here
+        }
+        Self::set_word_size(WordSize::EightBit);
+
+        let request = self.txdma.request();
+        let dst = T::regs().dr().ptr() as *mut u8;
+        let f = self.txdma.write(request, write, dst);
+
+        unsafe {
+            T::regs().cr2().modify(|reg| reg.set_txdmaen(true));
+            T::regs().cr1().modify(|w| w.set_spe(true));
+        }
+        f.await;
+        // `wait_for_idle` is not used: nothing on this path drains the RX FIFO it polls.
+        // DMA completion only means the last byte was handed to the SPI: drain first.
+        unsafe {
+            while T::regs().sr().read().ftlvl() > 0 || T::regs().sr().read().bsy() {}
+            T::regs().cr2().modify(|reg| reg.set_txdmaen(false));
+            T::regs().cr1().modify(|w| w.set_spe(false));
+        }
+        Ok(())
+    }
+
+    /// Reads `read.len()` bytes over SPI using DMA, 8 bits per word.
+    ///
+    /// The TX channel is handed a single `0x00` filler byte and a count of
+    /// `read.len()` (via `write_x`) while the RX channel stores into `read`.
+    #[allow(unused)]
+    async fn read_dma_u8(&mut self, read: &mut [u8]) -> Result<(), Error>
+    where
+        Tx: TxDmaChannel<T>,
+        Rx: RxDmaChannel<T>,
+    {
+        // Reconfigure with the peripheral disabled; RX requests are enabled first.
+        unsafe {
+            T::regs().cr1().modify(|w| w.set_spe(false));
+            T::regs().cr2().modify(|reg| reg.set_rxdmaen(true));
+        }
+        Self::set_word_size(WordSize::EightBit);
+
+        let clock_byte_count = read.len();
+
+        let rx_request = self.rxdma.request();
+        let rx_src = T::regs().dr().ptr() as *mut u8;
+        let rx_f = self.rxdma.read(rx_request, rx_src, read);
+
+        let tx_request = self.txdma.request();
+        let tx_dst = T::regs().dr().ptr() as *mut u8;
+        let clock_byte = 0x00;
+        let tx_f = self
+            .txdma
+            .write_x(tx_request, &clock_byte, clock_byte_count, tx_dst);
+
+        // Enable TX requests, then switch the peripheral on.
+        unsafe {
+            T::regs().cr2().modify(|reg| reg.set_txdmaen(true));
+            T::regs().cr1().modify(|w| w.set_spe(true));
+        }
+
+        // `wait_for_idle` busy-waits, so it must only run once both DMA transfers
+        // are done; polling it alongside them would stall the executor.
+        futures::future::join(tx_f, rx_f).await;
+        Self::wait_for_idle().await;
+
+        // Transfer finished: stop DMA requests and switch the peripheral off.
+        unsafe {
+            T::regs().cr2().modify(|reg| {
+                reg.set_txdmaen(false);
+                reg.set_rxdmaen(false);
+            });
+            T::regs().cr1().modify(|w| w.set_spe(false));
+        }
+
+        Ok(())
+    }
+
+    /// Runs a full-duplex SPI transfer using DMA, 8 bits per word: bytes from
+    /// `write` are sent while the bytes received are stored into `read`.
+    ///
+    /// NOTE(review): buffer lengths are not compared here; confirm callers pass equal lengths.
+    #[allow(unused)]
+    async fn read_write_dma_u8(&mut self, read: &mut [u8], write: &[u8]) -> Result<(), Error>
+    where
+        Tx: TxDmaChannel<T>,
+        Rx: RxDmaChannel<T>,
+    {
+        // Reconfigure with the peripheral disabled; RX requests are enabled first.
+        unsafe {
+            T::regs().cr1().modify(|w| w.set_spe(false));
+            T::regs().cr2().modify(|reg| reg.set_rxdmaen(true));
+        }
+        Self::set_word_size(WordSize::EightBit);
+
+        let rx_request = self.rxdma.request();
+        let rx_src = T::regs().dr().ptr() as *mut u8;
+        let rx_f = self.rxdma.read(rx_request, rx_src, read);
+
+        let tx_request = self.txdma.request();
+        let tx_dst = T::regs().dr().ptr() as *mut u8;
+        let tx_f = self.txdma.write(tx_request, write, tx_dst);
+
+        // Enable TX requests, then switch the peripheral on.
+        unsafe {
+            T::regs().cr2().modify(|reg| reg.set_txdmaen(true));
+            T::regs().cr1().modify(|w| w.set_spe(true));
+        }
+
+        // `wait_for_idle` busy-waits, so it must only run once both DMA transfers
+        // are done; polling it alongside them would stall the executor.
+        futures::future::join(tx_f, rx_f).await;
+        Self::wait_for_idle().await;
+
+        // Transfer finished: stop DMA requests and switch the peripheral off.
+        unsafe {
+            T::regs().cr2().modify(|reg| {
+                reg.set_txdmaen(false);
+                reg.set_rxdmaen(false);
+            });
+            T::regs().cr1().modify(|w| w.set_spe(false));
+        }
+
+        Ok(())
+    }
+
+    /// Busy-waits until the status register reads `ftlvl == 0`, then
+    /// `frlvl == 0`, then a cleared `bsy` flag, checked in that order.
+    ///
+    /// There is no await point inside: a single poll spins until all three
+    /// conditions have been observed.
+    async fn wait_for_idle() {
+        // One fresh status-register read per loop iteration.
+        let status = || unsafe { T::regs().sr().read() };
+
+        while status().ftlvl() > 0 {}
+        while status().frlvl() > 0 {}
+        while status().bsy() {}
+    }
 }
 
-impl<'d, T: Instance> Drop for Spi<'d, T> {
+impl<'d, T: Instance, Tx, Rx> Drop for Spi<'d, T, Tx, Rx> {
     fn drop(&mut self) {
         unsafe {
             Self::unconfigure_pin(self.sck.block(), self.sck.pin() as _);
@@ -200,7 +360,7 @@ fn read_word<W: Word>(regs: &'static crate::pac::spi::Spi) -> Result<W, Error> {
     }
 }
 
-impl<'d, T: Instance> embedded_hal::blocking::spi::Write<u8> for Spi<'d, T> {
+impl<'d, T: Instance, Rx> embedded_hal::blocking::spi::Write<u8> for Spi<'d, T, NoDma, Rx> {
     type Error = Error;
 
     fn write(&mut self, words: &[u8]) -> Result<(), Self::Error> {
@@ -216,7 +376,7 @@ impl<'d, T: Instance> embedded_hal::blocking::spi::Write<u8> for Spi<'d, T> {
     }
 }
 
-impl<'d, T: Instance> embedded_hal::blocking::spi::Transfer<u8> for Spi<'d, T> {
+impl<'d, T: Instance> embedded_hal::blocking::spi::Transfer<u8> for Spi<'d, T, NoDma, NoDma> {
     type Error = Error;
 
     fn transfer<'w>(&mut self, words: &'w mut [u8]) -> Result<&'w [u8], Self::Error> {
@@ -232,7 +392,7 @@ impl<'d, T: Instance> embedded_hal::blocking::spi::Transfer<u8> for Spi<'d, T> {
     }
 }
 
-impl<'d, T: Instance> embedded_hal::blocking::spi::Write<u16> for Spi<'d, T> {
+impl<'d, T: Instance, Rx> embedded_hal::blocking::spi::Write<u16> for Spi<'d, T, NoDma, Rx> {
     type Error = Error;
 
     fn write(&mut self, words: &[u16]) -> Result<(), Self::Error> {
@@ -248,7 +408,7 @@ impl<'d, T: Instance> embedded_hal::blocking::spi::Write<u16> for Spi<'d, T> {
     }
 }
 
-impl<'d, T: Instance> embedded_hal::blocking::spi::Transfer<u16> for Spi<'d, T> {
+impl<'d, T: Instance> embedded_hal::blocking::spi::Transfer<u16> for Spi<'d, T, NoDma, NoDma> {
     type Error = Error;
 
     fn transfer<'w>(&mut self, words: &'w mut [u16]) -> Result<&'w [u16], Self::Error> {
@@ -263,3 +423,42 @@ impl<'d, T: Instance> embedded_hal::blocking::spi::Transfer<u16> for Spi<'d, T>
         Ok(words)
     }
 }
+
+impl<'d, T: Instance, Tx, Rx> traits::Spi<u8> for Spi<'d, T, Tx, Rx> {
+    type Error = super::Error; // the `Error` type of the parent `spi` module
+}
+
+impl<'d, T: Instance, Tx: TxDmaChannel<T>, Rx> traits::Write<u8> for Spi<'d, T, Tx, Rx> {
+    #[rustfmt::skip]
+    type WriteFuture<'a> where Self: 'a = impl Future<Output = Result<(), Self::Error>> + 'a;
+    /// Writes `data` by delegating to the DMA-backed `write_dma_u8`.
+    fn write<'a>(&'a mut self, data: &'a [u8]) -> Self::WriteFuture<'a> {
+        self.write_dma_u8(data)
+    }
+}
+
+impl<'d, T: Instance, Tx: TxDmaChannel<T>, Rx: RxDmaChannel<T>> traits::Read<u8>
+    for Spi<'d, T, Tx, Rx>
+{
+    #[rustfmt::skip]
+    type ReadFuture<'a> where Self: 'a = impl Future<Output = Result<(), Self::Error>> + 'a;
+    /// Fills `data` by delegating to the DMA-backed `read_dma_u8`.
+    fn read<'a>(&'a mut self, data: &'a mut [u8]) -> Self::ReadFuture<'a> {
+        self.read_dma_u8(data)
+    }
+}
+
+impl<'d, T: Instance, Tx: TxDmaChannel<T>, Rx: RxDmaChannel<T>> traits::FullDuplex<u8>
+    for Spi<'d, T, Tx, Rx>
+{
+    #[rustfmt::skip]
+    type WriteReadFuture<'a> where Self: 'a = impl Future<Output = Result<(), Self::Error>> + 'a;
+    /// Sends `write` and fills `read` by delegating to the DMA-backed `read_write_dma_u8`.
+    fn read_write<'a>(
+        &'a mut self,
+        read: &'a mut [u8],
+        write: &'a [u8],
+    ) -> Self::WriteReadFuture<'a> {
+        self.read_write_dma_u8(read, write)
+    }
+}
diff --git a/embassy-stm32/src/spi/v3.rs b/embassy-stm32/src/spi/v3.rs
index eb8df44ae..fb2a46f3e 100644
--- a/embassy-stm32/src/spi/v3.rs
+++ b/embassy-stm32/src/spi/v3.rs
@@ -201,7 +201,28 @@ impl<'d, T: Instance, Tx, Rx> Spi<'d, T, Tx, Rx> {
         Tx: TxDmaChannel<T>,
         Rx: RxDmaChannel<T>,
     {
-        unimplemented!()
+        let clock_byte_count = read.len();
+
+        let rx_request = self.rxdma.request();
+        let rx_src = T::regs().rxdr().ptr() as *mut u8;
+        let rx_f = self.rxdma.read(rx_request, rx_src, read);
+
+        let tx_request = self.txdma.request();
+        let tx_dst = T::regs().txdr().ptr() as *mut u8;
+        let clock_byte = 0x00;
+        let tx_f = self
+            .txdma
+            .write_x(tx_request, &clock_byte, clock_byte_count, tx_dst);
+
+        unsafe {
+            T::regs().cfg1().modify(|reg| {
+                reg.set_txdmaen(true);
+                reg.set_rxdmaen(true);
+            });
+        }
+
+        let r = join(tx_f, rx_f).await;
+        Ok(())
     }
 
     #[allow(unused)]
@@ -218,10 +239,7 @@ impl<'d, T: Instance, Tx, Rx> Spi<'d, T, Tx, Rx> {
 
         let tx_request = self.txdma.request();
         let tx_dst = T::regs().txdr().ptr() as *mut u8;
-        let clock_byte = 0x00;
-        let tx_f = self
-            .txdma
-            .write_x(tx_request, &clock_byte, clock_byte_count, tx_dst);
+        let tx_f = self.txdma.write(tx_request, write, tx_dst);
 
         unsafe {
             T::regs().cfg1().modify(|reg| {
diff --git a/examples/stm32l4/src/bin/spi.rs b/examples/stm32l4/src/bin/spi.rs
index 8702fe0cc..14605283b 100644
--- a/examples/stm32l4/src/bin/spi.rs
+++ b/examples/stm32l4/src/bin/spi.rs
@@ -17,6 +17,7 @@ use embassy_stm32::time::Hertz;
 use embedded_hal::blocking::spi::Transfer;
 use embedded_hal::digital::v2::OutputPin;
 use example_common::*;
+use embassy_stm32::dma::NoDma;
 
 #[entry]
 fn main() -> ! {
@@ -41,6 +42,8 @@ fn main() -> ! {
         p.PC10,
         p.PC12,
         p.PC11,
+        NoDma,
+        NoDma,
         Hertz(1_000_000),
         Config::default(),
     );
diff --git a/examples/stm32l4/src/bin/spi_dma.rs b/examples/stm32l4/src/bin/spi_dma.rs
new file mode 100644
index 000000000..ca77c2f9b
--- /dev/null
+++ b/examples/stm32l4/src/bin/spi_dma.rs
@@ -0,0 +1,103 @@
+#![no_std]
+#![no_main]
+#![feature(trait_alias)]
+#![feature(min_type_alias_impl_trait)]
+#![feature(impl_trait_in_bindings)]
+#![feature(type_alias_impl_trait)]
+#![allow(incomplete_features)]
+
+#[path = "../example_common.rs"]
+mod example_common;
+
+use cortex_m_rt::entry;
+use embassy::executor::Executor;
+use embassy::time::Clock;
+use embassy::util::Forever;
+use embassy_stm32::pac;
+use example_common::*;
+use embassy_stm32::spi::{Spi, Config};
+use embassy_traits::spi::FullDuplex;
+use embassy_stm32::time::Hertz;
+use embassy_stm32::gpio::{Output, Level, Speed};
+use embedded_hal::digital::v2::OutputPin;
+
+#[embassy::task]
+async fn main_task() {
+    let p = embassy_stm32::init(Default::default());
+    // Example task: SPI3 driven through two DMA channels, exercised in an endless loop.
+    let mut spi = Spi::new(
+        p.SPI3,
+        p.PC10, // sck
+        p.PC12, // mosi
+        p.PC11, // miso
+        p.DMA1_CH0, // txdma
+        p.DMA1_CH1, // rxdma
+        Hertz(1_000_000),
+        Config::default(),
+    );
+    // Output on PE0, created at `Level::High`; it is driven low around each transfer below.
+    let mut cs = Output::new(p.PE0, Level::High, Speed::VeryHigh);
+    // Each iteration sends ten 0x0A bytes and logs the ten bytes received.
+    loop {
+        let write = [0x0A; 10];
+        let mut read = [0; 10];
+        unwrap!(cs.set_low());
+        spi.read_write(&mut read, &write).await.ok(); // the transfer `Result` is discarded
+        unwrap!(cs.set_high());
+        info!("xfer {=[u8]:x}", read);
+    }
+}
+
+struct ZeroClock;
+// `Clock` implementation whose `now` always returns 0.
+impl Clock for ZeroClock {
+    fn now(&self) -> u64 {
+        0
+    }
+}
+
+static EXECUTOR: Forever<Executor> = Forever::new(); // populated in `main` through `put`
+
+#[entry]
+fn main() -> ! {
+    info!("Hello World!");
+    // Raw PAC register setup: DBGMCU sleep/standby/stop debug bits, then RCC enable bits.
+    unsafe {
+        pac::DBGMCU.cr().modify(|w| {
+            w.set_dbg_sleep(true);
+            w.set_dbg_standby(true);
+            w.set_dbg_stop(true);
+        });
+
+        // NOTE(review): nothing here writes an SPI3 enable bit to RCC; presumably
+        // `Spi::new` takes care of that. Confirm before relying on it.
+
+        // Set the SYSCFG enable bit in RCC APB2ENR.
+        pac::RCC.apb2enr().modify(|w| {
+            w.set_syscfgen(true);
+        });
+        // Set the DMAMUX1, DMA1 and DMA2 enable bits in RCC AHB1ENR.
+        pac::RCC.ahb1enr().modify(|w| {
+            w.set_dmamux1en(true);
+            w.set_dma1en(true);
+            w.set_dma2en(true);
+        });
+        // Set the GPIOA through GPIOF enable bits in RCC AHB2ENR.
+        pac::RCC.ahb2enr().modify(|w| {
+            w.set_gpioaen(true);
+            w.set_gpioben(true);
+            w.set_gpiocen(true);
+            w.set_gpioden(true);
+            w.set_gpioeen(true);
+            w.set_gpiofen(true);
+        });
+    }
+    // Register `ZeroClock` through `embassy::time::set_clock`.
+    unsafe { embassy::time::set_clock(&ZeroClock) };
+
+    let executor = EXECUTOR.put(Executor::new());
+    // Run the executor; `main_task` is spawned in the closure passed to `run`.
+    executor.run(|spawner| {
+        unwrap!(spawner.spawn(main_task()));
+    })
+}