diff --git a/embassy-executor/src/raw/mod.rs b/embassy-executor/src/raw/mod.rs
index 42bd82262..15ff18fc8 100644
--- a/embassy-executor/src/raw/mod.rs
+++ b/embassy-executor/src/raw/mod.rs
@@ -13,11 +13,12 @@ mod timer_queue;
 pub(crate) mod util;
 mod waker;
 
-use core::cell::Cell;
 use core::future::Future;
+use core::marker::PhantomData;
 use core::mem;
 use core::pin::Pin;
 use core::ptr::NonNull;
+use core::sync::atomic::AtomicPtr;
 use core::task::{Context, Poll};
 
 use atomic_polyfill::{AtomicU32, Ordering};
@@ -30,7 +31,7 @@ use embassy_time::Instant;
 use rtos_trace::trace;
 
 use self::run_queue::{RunQueue, RunQueueItem};
-use self::util::UninitCell;
+use self::util::{SyncUnsafeCell, UninitCell};
 pub use self::waker::task_from_waker;
 use super::SpawnToken;
 
@@ -46,11 +47,11 @@ pub(crate) const STATE_TIMER_QUEUED: u32 = 1 << 2;
 pub(crate) struct TaskHeader {
     pub(crate) state: AtomicU32,
     pub(crate) run_queue_item: RunQueueItem,
-    pub(crate) executor: Cell<Option<&'static Executor>>,
-    poll_fn: Cell<Option<unsafe fn(TaskRef)>>,
+    pub(crate) executor: SyncUnsafeCell<Option<&'static SyncExecutor>>,
+    poll_fn: SyncUnsafeCell<Option<unsafe fn(TaskRef)>>,
 
     #[cfg(feature = "integrated-timers")]
-    pub(crate) expires_at: Cell<Instant>,
+    pub(crate) expires_at: SyncUnsafeCell<Instant>,
     #[cfg(feature = "integrated-timers")]
     pub(crate) timer_queue_item: timer_queue::TimerQueueItem,
 }
@@ -61,6 +62,9 @@ pub struct TaskRef {
     ptr: NonNull<TaskHeader>,
 }
 
+unsafe impl Send for TaskRef where &'static TaskHeader: Send {}
+unsafe impl Sync for TaskRef where &'static TaskHeader: Sync {}
+
 impl TaskRef {
     fn new<F: Future + 'static>(task: &'static TaskStorage<F>) -> Self {
         Self {
@@ -115,12 +119,12 @@ impl<F: Future + 'static> TaskStorage<F> {
             raw: TaskHeader {
                 state: AtomicU32::new(0),
                 run_queue_item: RunQueueItem::new(),
-                executor: Cell::new(None),
+                executor: SyncUnsafeCell::new(None),
                 // Note: this is lazily initialized so that a static `TaskStorage` will go in `.bss`
-                poll_fn: Cell::new(None),
+                poll_fn: SyncUnsafeCell::new(None),
 
                 #[cfg(feature = "integrated-timers")]
-                expires_at: Cell::new(Instant::from_ticks(0)),
+                expires_at: SyncUnsafeCell::new(Instant::from_ticks(0)),
                 #[cfg(feature = "integrated-timers")]
                 timer_queue_item: timer_queue::TimerQueueItem::new(),
             },
@@ -170,9 +174,15 @@ impl<F: Future + 'static> TaskStorage<F> {
         // it's a noop for our waker.
         mem::forget(waker);
     }
-}
 
-unsafe impl<F: Future + 'static> Sync for TaskStorage<F> {}
+    #[doc(hidden)]
+    #[allow(dead_code)]
+    fn _assert_sync(self) {
+        fn assert_sync<T: Sync>(_: T) {}
+
+        assert_sync(self)
+    }
+}
 
 struct AvailableTask<F: Future + 'static> {
     task: &'static TaskStorage<F>,
@@ -279,29 +289,10 @@ impl<F: Future + 'static, const N: usize> TaskPool<F, N> {
     }
 }
 
-/// Raw executor.
-///
-/// This is the core of the Embassy executor. It is low-level, requiring manual
-/// handling of wakeups and task polling. If you can, prefer using one of the
-/// [higher level executors](crate::Executor).
-///
-/// The raw executor leaves it up to you to handle wakeups and scheduling:
-///
-/// - To get the executor to do work, call `poll()`. This will poll all queued tasks (all tasks
-///   that "want to run").
-/// - You must supply a `signal_fn`. The executor will call it to notify you it has work
-///   to do. You must arrange for `poll()` to be called as soon as possible.
-///
-/// `signal_fn` can be called from *any* context: any thread, any interrupt priority
-/// level, etc. It may be called synchronously from any `Executor` method call as well.
-/// You must deal with this correctly.
-///
-/// In particular, you must NOT call `poll` directly from `signal_fn`, as this violates
-/// the requirement for `poll` to not be called reentrantly.
-pub struct Executor {
+pub(crate) struct SyncExecutor {
     run_queue: RunQueue,
     signal_fn: fn(*mut ()),
-    signal_ctx: *mut (),
+    signal_ctx: AtomicPtr<()>,
 
     #[cfg(feature = "integrated-timers")]
     pub(crate) timer_queue: timer_queue::TimerQueue,
@@ -309,14 +300,8 @@ pub struct Executor {
     alarm: AlarmHandle,
 }
 
-impl Executor {
-    /// Create a new executor.
-    ///
-    /// When the executor has work to do, it will call `signal_fn` with
-    /// `signal_ctx` as argument.
-    ///
-    /// See [`Executor`] docs for details on `signal_fn`.
-    pub fn new(signal_fn: fn(*mut ()), signal_ctx: *mut ()) -> Self {
+impl SyncExecutor {
+    pub(crate) fn new(signal_fn: fn(*mut ()), signal_ctx: *mut ()) -> Self {
         #[cfg(feature = "integrated-timers")]
         let alarm = unsafe { unwrap!(driver::allocate_alarm()) };
         #[cfg(feature = "integrated-timers")]
@@ -325,7 +310,7 @@ impl Executor {
         Self {
             run_queue: RunQueue::new(),
             signal_fn,
-            signal_ctx,
+            signal_ctx: AtomicPtr::new(signal_ctx),
 
             #[cfg(feature = "integrated-timers")]
             timer_queue: timer_queue::TimerQueue::new(),
@@ -346,19 +331,10 @@ impl Executor {
         trace::task_ready_begin(task.as_ptr() as u32);
 
         if self.run_queue.enqueue(cs, task) {
-            (self.signal_fn)(self.signal_ctx)
+            (self.signal_fn)(self.signal_ctx.load(Ordering::Relaxed))
         }
     }
 
-    /// Spawn a task in this executor.
-    ///
-    /// # Safety
-    ///
-    /// `task` must be a valid pointer to an initialized but not-already-spawned task.
-    ///
-    /// It is OK to use `unsafe` to call this from a thread that's not the executor thread.
-    /// In this case, the task's Future must be Send. This is because this is effectively
-    /// sending the task to the executor thread.
     pub(super) unsafe fn spawn(&'static self, task: TaskRef) {
         task.header().executor.set(Some(self));
 
@@ -370,24 +346,11 @@ impl Executor {
         })
     }
 
-    /// Poll all queued tasks in this executor.
-    ///
-    /// This loops over all tasks that are queued to be polled (i.e. they're
-    /// freshly spawned or they've been woken). Other tasks are not polled.
-    ///
-    /// You must call `poll` after receiving a call to `signal_fn`. It is OK
-    /// to call `poll` even when not requested by `signal_fn`, but it wastes
-    /// energy.
-    ///
     /// # Safety
     ///
-    /// You must NOT call `poll` reentrantly on the same executor.
-    ///
-    /// In particular, note that `poll` may call `signal_fn` synchronously. Therefore, you
-    /// must NOT directly call `poll()` from your `signal_fn`. Instead, `signal_fn` has to
-    /// somehow schedule for `poll()` to be called later, at a time you know for sure there's
-    /// no `poll()` already running.
-    pub unsafe fn poll(&'static self) {
+    /// Same as [`Executor::poll`], plus you must only call this on the thread this executor was created on.
+    pub(crate) unsafe fn poll(&'static self) {
+        #[allow(clippy::never_loop)]
         loop {
             #[cfg(feature = "integrated-timers")]
             self.timer_queue.dequeue_expired(Instant::now(), |task| wake_task(task));
@@ -441,6 +404,84 @@ impl Executor {
         #[cfg(feature = "rtos-trace")]
         trace::system_idle();
     }
+}
+
+/// Raw executor.
+///
+/// This is the core of the Embassy executor. It is low-level, requiring manual
+/// handling of wakeups and task polling. If you can, prefer using one of the
+/// [higher level executors](crate::Executor).
+///
+/// The raw executor leaves it up to you to handle wakeups and scheduling:
+///
+/// - To get the executor to do work, call `poll()`. This will poll all queued tasks (all tasks
+///   that "want to run").
+/// - You must supply a `signal_fn`. The executor will call it to notify you it has work
+///   to do. You must arrange for `poll()` to be called as soon as possible.
+///
+/// `signal_fn` can be called from *any* context: any thread, any interrupt priority
+/// level, etc. It may be called synchronously from any `Executor` method call as well.
+/// You must deal with this correctly.
+///
+/// In particular, you must NOT call `poll` directly from `signal_fn`, as this violates
+/// the requirement for `poll` to not be called reentrantly.
+#[repr(transparent)]
+pub struct Executor {
+    pub(crate) inner: SyncExecutor,
+
+    _not_sync: PhantomData<*mut ()>,
+}
+
+impl Executor {
+    pub(crate) unsafe fn wrap(inner: &SyncExecutor) -> &Self {
+        &*(inner as *const SyncExecutor as *const Self) // same layout: `Executor` is `#[repr(transparent)]`
+    }
+    /// Create a new executor.
+    ///
+    /// When the executor has work to do, it will call `signal_fn` with
+    /// `signal_ctx` as argument.
+    ///
+    /// See [`Executor`] docs for details on `signal_fn`.
+    pub fn new(signal_fn: fn(*mut ()), signal_ctx: *mut ()) -> Self {
+        Self {
+            inner: SyncExecutor::new(signal_fn, signal_ctx),
+            _not_sync: PhantomData,
+        }
+    }
+
+    /// Spawn a task in this executor.
+    ///
+    /// # Safety
+    ///
+    /// `task` must be a valid pointer to an initialized but not-already-spawned task.
+    ///
+    /// It is OK to use `unsafe` to call this from a thread that's not the executor thread.
+    /// In this case, the task's Future must be Send. This is because this is effectively
+    /// sending the task to the executor thread.
+    pub(super) unsafe fn spawn(&'static self, task: TaskRef) {
+        self.inner.spawn(task)
+    }
+
+    /// Poll all queued tasks in this executor.
+    ///
+    /// This loops over all tasks that are queued to be polled (i.e. they're
+    /// freshly spawned or they've been woken). Other tasks are not polled.
+    ///
+    /// You must call `poll` after receiving a call to `signal_fn`. It is OK
+    /// to call `poll` even when not requested by `signal_fn`, but it wastes
+    /// energy.
+    ///
+    /// # Safety
+    ///
+    /// You must NOT call `poll` reentrantly on the same executor.
+    ///
+    /// In particular, note that `poll` may call `signal_fn` synchronously. Therefore, you
+    /// must NOT directly call `poll()` from your `signal_fn`. Instead, `signal_fn` has to
+    /// somehow schedule for `poll()` to be called later, at a time you know for sure there's
+    /// no `poll()` already running.
+    pub unsafe fn poll(&'static self) {
+        self.inner.poll()
+    }
 
     /// Get a spawner that spawns tasks in this executor.
     ///
@@ -483,8 +524,10 @@ impl embassy_time::queue::TimerQueue for TimerQueue {
     fn schedule_wake(&'static self, at: Instant, waker: &core::task::Waker) {
         let task = waker::task_from_waker(waker);
         let task = task.header();
-        let expires_at = task.expires_at.get();
-        task.expires_at.set(expires_at.min(at));
+        unsafe {
+            let expires_at = task.expires_at.get();
+            task.expires_at.set(expires_at.min(at));
+        }
     }
 }
 
diff --git a/embassy-executor/src/raw/timer_queue.rs b/embassy-executor/src/raw/timer_queue.rs
index 57d6d3cda..dc71c95b1 100644
--- a/embassy-executor/src/raw/timer_queue.rs
+++ b/embassy-executor/src/raw/timer_queue.rs
@@ -1,28 +1,32 @@
-use core::cell::Cell;
 use core::cmp::min;
 
 use atomic_polyfill::Ordering;
 use embassy_time::Instant;
 
 use super::{TaskRef, STATE_TIMER_QUEUED};
+use crate::raw::util::SyncUnsafeCell;
 
 pub(crate) struct TimerQueueItem {
-    next: Cell<Option<TaskRef>>,
+    next: SyncUnsafeCell<Option<TaskRef>>,
 }
 
 impl TimerQueueItem {
     pub const fn new() -> Self {
-        Self { next: Cell::new(None) }
+        Self {
+            next: SyncUnsafeCell::new(None),
+        }
     }
 }
 
 pub(crate) struct TimerQueue {
-    head: Cell<Option<TaskRef>>,
+    head: SyncUnsafeCell<Option<TaskRef>>,
 }
 
 impl TimerQueue {
     pub const fn new() -> Self {
-        Self { head: Cell::new(None) }
+        Self {
+            head: SyncUnsafeCell::new(None),
+        }
     }
 
     pub(crate) unsafe fn update(&self, p: TaskRef) {
diff --git a/embassy-executor/src/raw/util.rs b/embassy-executor/src/raw/util.rs
index 2b1f6b6f3..e2e8f4df8 100644
--- a/embassy-executor/src/raw/util.rs
+++ b/embassy-executor/src/raw/util.rs
@@ -25,3 +25,50 @@ impl<T> UninitCell<T> {
         ptr::drop_in_place(self.as_mut_ptr())
     }
 }
+
+// NOTE(review): this impl carries no `T: Sync` bound, so the compiler verifies nothing here;
+// soundness depends on how `UninitCell` is used elsewhere -- confirm.
+unsafe impl<T> Sync for UninitCell<T> {}
+
+/// A thin wrapper around `UnsafeCell<T>` that is `Sync` whenever `T` is `Sync`.
+///
+/// Reads and writes go through `unsafe` methods, so every caller takes on the
+/// responsibility of ruling out conflicting accesses.
+#[repr(transparent)]
+pub struct SyncUnsafeCell<T> {
+    value: UnsafeCell<T>,
+}
+
+unsafe impl<T: Sync> Sync for SyncUnsafeCell<T> {}
+
+impl<T> SyncUnsafeCell<T> {
+    /// Creates a cell holding `value`.
+    #[inline]
+    pub const fn new(value: T) -> Self {
+        Self {
+            value: UnsafeCell::new(value),
+        }
+    }
+
+    /// Replaces the stored value with `value`.
+    ///
+    /// # Safety
+    ///
+    /// The write goes through a raw pointer obtained from a shared reference, so
+    /// no other access to this cell may overlap with it.
+    pub unsafe fn set(&self, value: T) {
+        *self.value.get() = value;
+    }
+
+    /// Returns a copy of the stored value.
+    ///
+    /// # Safety
+    ///
+    /// No write to this cell may overlap with the read.
+    pub unsafe fn get(&self) -> T
+    where
+        T: Copy,
+    {
+        *self.value.get()
+    }
+}
diff --git a/embassy-executor/src/spawner.rs b/embassy-executor/src/spawner.rs
index 7c0a0183c..2b6224045 100644
--- a/embassy-executor/src/spawner.rs
+++ b/embassy-executor/src/spawner.rs
@@ -92,6 +92,7 @@ impl Spawner {
         poll_fn(|cx| {
             let task = raw::task_from_waker(cx.waker());
             let executor = unsafe { task.header().executor.get().unwrap_unchecked() };
+            let executor = unsafe { raw::Executor::wrap(executor) };
             Poll::Ready(Self::new(executor))
         })
         .await
@@ -130,9 +131,7 @@ impl Spawner {
     /// spawner to other threads, but the spawner loses the ability to spawn
     /// non-Send tasks.
     pub fn make_send(&self) -> SendSpawner {
-        SendSpawner {
-            executor: self.executor,
-        }
+        SendSpawner::new(&self.executor.inner)
     }
 }
 
@@ -145,14 +144,11 @@ impl Spawner {
 /// If you want to spawn non-Send tasks, use [Spawner].
 #[derive(Copy, Clone)]
 pub struct SendSpawner {
-    executor: &'static raw::Executor,
+    executor: &'static raw::SyncExecutor,
 }
 
-unsafe impl Send for SendSpawner {}
-unsafe impl Sync for SendSpawner {}
-
 impl SendSpawner {
-    pub(crate) fn new(executor: &'static raw::Executor) -> Self {
+    pub(crate) fn new(executor: &'static raw::SyncExecutor) -> Self {
         Self { executor }
     }
 
diff --git a/embassy-rp/src/dma.rs b/embassy-rp/src/dma.rs
index 05adcecdd..ba07a88df 100644
--- a/embassy-rp/src/dma.rs
+++ b/embassy-rp/src/dma.rs
@@ -1,3 +1,4 @@
+//! Direct Memory Access (DMA)
 use core::future::Future;
 use core::pin::Pin;
 use core::sync::atomic::{compiler_fence, Ordering};
diff --git a/embassy-rp/src/spi.rs b/embassy-rp/src/spi.rs
index 584370d56..ebd621ecf 100644
--- a/embassy-rp/src/spi.rs
+++ b/embassy-rp/src/spi.rs
@@ -1,3 +1,4 @@
+//! Serial Peripheral Interface
 use core::marker::PhantomData;
 
 use embassy_embedded_hal::SetConfig;
@@ -383,21 +384,33 @@ impl<'d, T: Instance> Spi<'d, T, Async> {
     }
 
     async fn transfer_inner(&mut self, rx_ptr: *mut [u8], tx_ptr: *const [u8]) -> Result<(), Error> {
-        let (_, from_len) = crate::dma::slice_ptr_parts(tx_ptr);
-        let (_, to_len) = crate::dma::slice_ptr_parts_mut(rx_ptr);
-        assert_eq!(from_len, to_len);
+        let (_, tx_len) = crate::dma::slice_ptr_parts(tx_ptr);
+        let (_, rx_len) = crate::dma::slice_ptr_parts_mut(rx_ptr);
+
         unsafe {
             self.inner.regs().dmacr().write(|reg| {
                 reg.set_rxdmae(true);
                 reg.set_txdmae(true);
             })
         };
-        let tx_ch = self.tx_dma.as_mut().unwrap();
-        let tx_transfer = unsafe {
-            // If we don't assign future to a variable, the data register pointer
-            // is held across an await and makes the future non-Send.
-            crate::dma::write(tx_ch, tx_ptr, self.inner.regs().dr().ptr() as *mut _, T::TX_DREQ)
+
+        let mut tx_ch = self.tx_dma.as_mut().unwrap();
+        // If we don't assign future to a variable, the data register pointer
+        // is held across an await and makes the future non-Send.
+        let tx_transfer = async {
+            let p = self.inner.regs();
+            unsafe {
+                crate::dma::write(&mut tx_ch, tx_ptr, p.dr().ptr() as *mut _, T::TX_DREQ).await;
+
+                if rx_len > tx_len {
+                    let write_bytes_len = rx_len - tx_len;
+                    // write dummy data
+                    // this will disable incrementation of the buffers
+                    crate::dma::write_repeated(tx_ch, p.dr().ptr() as *mut u8, write_bytes_len, T::TX_DREQ).await
+                }
+            }
         };
+
         let rx_ch = self.rx_dma.as_mut().unwrap();
         let rx_transfer = unsafe {
             // If we don't assign future to a variable, the data register pointer
@@ -405,6 +418,22 @@ impl<'d, T: Instance> Spi<'d, T, Async> {
             crate::dma::read(rx_ch, self.inner.regs().dr().ptr() as *const _, rx_ptr, T::RX_DREQ)
         };
         join(tx_transfer, rx_transfer).await;
+
+        // if tx > rx we should clear any overflow of the FIFO SPI buffer
+        if tx_len > rx_len {
+            let p = self.inner.regs();
+            unsafe {
+                while p.sr().read().bsy() {}
+
+                // clear RX FIFO contents to prevent stale reads
+                while p.sr().read().rne() {
+                    let _: u16 = p.dr().read().data();
+                }
+                // clear RX overrun interrupt
+                p.icr().write(|w| w.set_roric(true));
+            }
+        }
+
         Ok(())
     }
 }
diff --git a/embassy-rp/src/uart/buffered.rs b/embassy-rp/src/uart/buffered.rs
index 32e5ddf14..1a573b311 100644
--- a/embassy-rp/src/uart/buffered.rs
+++ b/embassy-rp/src/uart/buffered.rs
@@ -124,7 +124,7 @@ impl<'d, T: Instance> BufferedUart<'d, T> {
         }
     }
 
-    pub fn blocking_write(&mut self, buffer: &[u8]) -> Result<(), Error> {
+    pub fn blocking_write(&mut self, buffer: &[u8]) -> Result<usize, Error> {
         self.tx.blocking_write(buffer)
     }
 
@@ -132,7 +132,7 @@ impl<'d, T: Instance> BufferedUart<'d, T> {
         self.tx.blocking_flush()
     }
 
-    pub fn blocking_read(&mut self, buffer: &mut [u8]) -> Result<(), Error> {
+    pub fn blocking_read(&mut self, buffer: &mut [u8]) -> Result<usize, Error> {
         self.rx.blocking_read(buffer)
     }
 
@@ -201,7 +201,7 @@ impl<'d, T: Instance> BufferedUartRx<'d, T> {
         })
     }
 
-    pub fn blocking_read(&mut self, buf: &mut [u8]) -> Result<(), Error> {
+    pub fn blocking_read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
         loop {
             let state = T::state();
             let mut rx_reader = unsafe { state.rx_buf.reader() };
@@ -222,7 +222,7 @@ impl<'d, T: Instance> BufferedUartRx<'d, T> {
                     });
                 }
 
-                return Ok(());
+                return Ok(n);
             }
         }
     }
@@ -326,7 +326,7 @@ impl<'d, T: Instance> BufferedUartTx<'d, T> {
         })
     }
 
-    pub fn blocking_write(&mut self, buf: &[u8]) -> Result<(), Error> {
+    pub fn blocking_write(&mut self, buf: &[u8]) -> Result<usize, Error> {
         loop {
             let state = T::state();
             let mut tx_writer = unsafe { state.tx_buf.writer() };
@@ -342,7 +342,7 @@ impl<'d, T: Instance> BufferedUartTx<'d, T> {
                 // FIFO was empty we have to manually pend the interrupt to shovel
                 // TX data from the buffer into the FIFO.
                 unsafe { T::Interrupt::steal() }.pend();
-                return Ok(());
+                return Ok(n);
             }
         }
     }
@@ -533,6 +533,38 @@ impl<'d, T: Instance + 'd> embedded_io::asynch::Write for BufferedUartTx<'d, T>
     }
 }
 
+impl<'d, T: Instance + 'd> embedded_io::blocking::Read for BufferedUart<'d, T> {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
+        self.rx.blocking_read(buf)
+    }
+}
+
+impl<'d, T: Instance + 'd> embedded_io::blocking::Read for BufferedUartRx<'d, T> {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
+        self.blocking_read(buf)
+    }
+}
+
+impl<'d, T: Instance + 'd> embedded_io::blocking::Write for BufferedUart<'d, T> {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
+        self.tx.blocking_write(buf)
+    }
+
+    fn flush(&mut self) -> Result<(), Self::Error> {
+        self.tx.blocking_flush()
+    }
+}
+
+impl<'d, T: Instance + 'd> embedded_io::blocking::Write for BufferedUartTx<'d, T> {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
+        self.blocking_write(buf)
+    }
+
+    fn flush(&mut self) -> Result<(), Self::Error> {
+        self.blocking_flush()
+    }
+}
+
 mod eh02 {
     use super::*;
 
@@ -566,8 +598,15 @@ mod eh02 {
     impl<'d, T: Instance> embedded_hal_02::blocking::serial::Write<u8> for BufferedUartTx<'d, T> {
         type Error = Error;
 
-        fn bwrite_all(&mut self, buffer: &[u8]) -> Result<(), Self::Error> {
-            self.blocking_write(buffer)
+        fn bwrite_all(&mut self, mut buffer: &[u8]) -> Result<(), Self::Error> {
+            while !buffer.is_empty() {
+                match self.blocking_write(buffer) {
+                    Ok(0) => panic!("zero-length write."),
+                    Ok(n) => buffer = &buffer[n..],
+                    Err(e) => return Err(e),
+                }
+            }
+            Ok(())
         }
 
         fn bflush(&mut self) -> Result<(), Self::Error> {
@@ -586,8 +625,15 @@ mod eh02 {
     impl<'d, T: Instance> embedded_hal_02::blocking::serial::Write<u8> for BufferedUart<'d, T> {
         type Error = Error;
 
-        fn bwrite_all(&mut self, buffer: &[u8]) -> Result<(), Self::Error> {
-            self.blocking_write(buffer)
+        fn bwrite_all(&mut self, mut buffer: &[u8]) -> Result<(), Self::Error> {
+            while !buffer.is_empty() {
+                match self.blocking_write(buffer) {
+                    Ok(0) => panic!("zero-length write."),
+                    Ok(n) => buffer = &buffer[n..],
+                    Err(e) => return Err(e),
+                }
+            }
+            Ok(())
         }
 
         fn bflush(&mut self) -> Result<(), Self::Error> {
@@ -620,7 +666,7 @@ mod eh1 {
 
     impl<'d, T: Instance> embedded_hal_1::serial::Write for BufferedUartTx<'d, T> {
         fn write(&mut self, buffer: &[u8]) -> Result<(), Self::Error> {
-            self.blocking_write(buffer)
+            self.blocking_write(buffer).map(drop)
         }
 
         fn flush(&mut self) -> Result<(), Self::Error> {
@@ -630,7 +676,7 @@ mod eh1 {
 
     impl<'d, T: Instance> embedded_hal_nb::serial::Write for BufferedUartTx<'d, T> {
         fn write(&mut self, char: u8) -> nb::Result<(), Self::Error> {
-            self.blocking_write(&[char]).map_err(nb::Error::Other)
+            self.blocking_write(&[char]).map(drop).map_err(nb::Error::Other)
         }
 
         fn flush(&mut self) -> nb::Result<(), Self::Error> {
@@ -646,7 +692,7 @@ mod eh1 {
 
     impl<'d, T: Instance> embedded_hal_1::serial::Write for BufferedUart<'d, T> {
         fn write(&mut self, buffer: &[u8]) -> Result<(), Self::Error> {
-            self.blocking_write(buffer)
+            self.blocking_write(buffer).map(drop)
         }
 
         fn flush(&mut self) -> Result<(), Self::Error> {
@@ -656,7 +702,7 @@ mod eh1 {
 
     impl<'d, T: Instance> embedded_hal_nb::serial::Write for BufferedUart<'d, T> {
         fn write(&mut self, char: u8) -> nb::Result<(), Self::Error> {
-            self.blocking_write(&[char]).map_err(nb::Error::Other)
+            self.blocking_write(&[char]).map(drop).map_err(nb::Error::Other)
         }
 
         fn flush(&mut self) -> nb::Result<(), Self::Error> {
diff --git a/embassy-stm32/Cargo.toml b/embassy-stm32/Cargo.toml
index b66d724d5..14ec3d70a 100644
--- a/embassy-stm32/Cargo.toml
+++ b/embassy-stm32/Cargo.toml
@@ -60,7 +60,7 @@ sdio-host = "0.5.0"
 embedded-sdmmc = { git = "https://github.com/embassy-rs/embedded-sdmmc-rs", rev = "46d1b1c2ff13e31e282ec1e352421721694f126a", optional = true }
 critical-section = "1.1"
 atomic-polyfill = "1.0.1"
-stm32-metapac = { version = "1", features = ["rt"] }
+stm32-metapac = { version = "2", features = ["rt"] }
 vcell = "0.1.3"
 bxcan = "0.7.0"
 nb = "1.0.0"
@@ -72,7 +72,7 @@ embedded-io = { version = "0.4.0", features = ["async"], optional = true }
 [build-dependencies]
 proc-macro2 = "1.0.36"
 quote = "1.0.15"
-stm32-metapac = { version = "1", default-features = false, features = ["metadata"]}
+stm32-metapac = { version = "2", default-features = false, features = ["metadata"]}
 
 [features]
 defmt = ["dep:defmt", "bxcan/unstable-defmt", "embassy-sync/defmt", "embassy-executor/defmt", "embassy-embedded-hal/defmt", "embassy-hal-common/defmt", "embedded-io?/defmt", "embassy-usb-driver?/defmt", "embassy-net-driver/defmt"]
diff --git a/embassy-stm32/build.rs b/embassy-stm32/build.rs
index dbfc1370d..3780c5a40 100644
--- a/embassy-stm32/build.rs
+++ b/embassy-stm32/build.rs
@@ -427,6 +427,12 @@ fn main() {
         (("sdmmc", "D6"), quote!(crate::sdmmc::D6Pin)),
         (("sdmmc", "D6"), quote!(crate::sdmmc::D7Pin)),
         (("sdmmc", "D8"), quote!(crate::sdmmc::D8Pin)),
+        (("quadspi", "BK1_IO0"), quote!(crate::qspi::D0Pin)),
+        (("quadspi", "BK1_IO1"), quote!(crate::qspi::D1Pin)),
+        (("quadspi", "BK1_IO2"), quote!(crate::qspi::D2Pin)),
+        (("quadspi", "BK1_IO3"), quote!(crate::qspi::D3Pin)),
+        (("quadspi", "CLK"), quote!(crate::qspi::SckPin)),
+        (("quadspi", "BK1_NCS"), quote!(crate::qspi::NSSPin)),
     ].into();
 
     for p in METADATA.peripherals {
@@ -507,6 +513,7 @@ fn main() {
         (("dcmi", "PSSI"), quote!(crate::dcmi::FrameDma)),
         // SDMMCv1 uses the same channel for both directions, so just implement for RX
         (("sdmmc", "RX"), quote!(crate::sdmmc::SdmmcDma)),
+        (("quadspi", "QUADSPI"), quote!(crate::qspi::QuadDma)),
     ]
     .into();
 
diff --git a/embassy-stm32/src/adc/sample_time.rs b/embassy-stm32/src/adc/sample_time.rs
index 60ba80048..bc5fb1d6f 100644
--- a/embassy-stm32/src/adc/sample_time.rs
+++ b/embassy-stm32/src/adc/sample_time.rs
@@ -1,5 +1,5 @@
 macro_rules! impl_sample_time {
-    ($default_doc:expr, $default:ident, $pac:ty, ($(($doc:expr, $variant:ident, $pac_variant:ident)),*)) => {
+    ($default_doc:expr, $default:ident, ($(($doc:expr, $variant:ident, $pac_variant:ident)),*)) => {
         #[doc = concat!("ADC sample time\n\nThe default setting is ", $default_doc, " ADC clock cycles.")]
         #[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
         pub enum SampleTime {
@@ -9,10 +9,10 @@ macro_rules! impl_sample_time {
             )*
         }
 
-        impl From<SampleTime> for $pac {
-            fn from(sample_time: SampleTime) -> $pac {
+        impl From<SampleTime> for crate::pac::adc::vals::SampleTime {
+            fn from(sample_time: SampleTime) -> crate::pac::adc::vals::SampleTime {
                 match sample_time {
-                    $(SampleTime::$variant => <$pac>::$pac_variant),*
+                    $(SampleTime::$variant => crate::pac::adc::vals::SampleTime::$pac_variant),*
                 }
             }
         }
@@ -29,7 +29,6 @@ macro_rules! impl_sample_time {
 impl_sample_time!(
     "1.5",
     Cycles1_5,
-    crate::pac::adc::vals::SampleTime,
     (
         ("1.5", Cycles1_5, CYCLES1_5),
         ("7.5", Cycles7_5, CYCLES7_5),
@@ -46,7 +45,6 @@ impl_sample_time!(
 impl_sample_time!(
     "3",
     Cycles3,
-    crate::pac::adc::vals::Smp,
     (
         ("3", Cycles3, CYCLES3),
         ("15", Cycles15, CYCLES15),
@@ -63,7 +61,6 @@ impl_sample_time!(
 impl_sample_time!(
     "2.5",
     Cycles2_5,
-    crate::pac::adc::vals::SampleTime,
     (
         ("2.5", Cycles2_5, CYCLES2_5),
         ("6.5", Cycles6_5, CYCLES6_5),
@@ -80,7 +77,6 @@ impl_sample_time!(
 impl_sample_time!(
     "1.5",
     Cycles1_5,
-    crate::pac::adc::vals::SampleTime,
     (
         ("1.5", Cycles1_5, CYCLES1_5),
         ("3.5", Cycles3_5, CYCLES3_5),
@@ -97,7 +93,6 @@ impl_sample_time!(
 impl_sample_time!(
     "1.5",
     Cycles1_5,
-    crate::pac::adc::vals::Smp,
     (
         ("1.5", Cycles1_5, CYCLES1_5),
         ("2.5", Cycles2_5, CYCLES2_5),
diff --git a/embassy-stm32/src/lib.rs b/embassy-stm32/src/lib.rs
index eeaa04f67..8dc4df2dc 100644
--- a/embassy-stm32/src/lib.rs
+++ b/embassy-stm32/src/lib.rs
@@ -48,6 +48,8 @@ pub mod crc;
 ))]
 pub mod flash;
 pub mod pwm;
+#[cfg(quadspi)]
+pub mod qspi;
 #[cfg(rng)]
 pub mod rng;
 #[cfg(sdmmc)]
@@ -60,7 +62,6 @@ pub mod usart;
 pub mod usb;
 #[cfg(otg)]
 pub mod usb_otg;
-
 #[cfg(iwdg)]
 pub mod wdg;
 
diff --git a/embassy-stm32/src/qspi/enums.rs b/embassy-stm32/src/qspi/enums.rs
new file mode 100644
index 000000000..2dbe2b061
--- /dev/null
+++ b/embassy-stm32/src/qspi/enums.rs
@@ -0,0 +1,294 @@
+#[allow(dead_code)]
+#[derive(Copy, Clone)]
+pub(crate) enum QspiMode { // Functional mode; its u8 encoding is written to the CCR `fmode` field.
+    IndirectWrite, // encodes to 0b00
+    IndirectRead, // encodes to 0b01
+    AutoPolling, // encodes to 0b10
+    MemoryMapped, // encodes to 0b11
+}
+
+impl From<QspiMode> for u8 { // `From` also provides `Into<u8>` through the std blanket impl
+    fn from(val: QspiMode) -> u8 { // returns the 2-bit functional-mode encoding
+        match val {
+            QspiMode::IndirectWrite => 0b00,
+            QspiMode::IndirectRead => 0b01,
+            QspiMode::AutoPolling => 0b10,
+            QspiMode::MemoryMapped => 0b11,
+        }
+    }
+}
+
+#[allow(dead_code)]
+#[derive(Copy, Clone)]
+pub enum QspiWidth { // Width selector for a transfer phase (CCR imode/admode/dmode/abmode fields).
+    NONE, // encodes to 0b00
+    SING, // encodes to 0b01 (presumably "single line" - TODO confirm)
+    DUAL, // encodes to 0b10
+    QUAD, // encodes to 0b11
+}
+
+impl From<QspiWidth> for u8 { // `From` also provides `Into<u8>` through the std blanket impl
+    fn from(val: QspiWidth) -> u8 { // returns the 2-bit width encoding
+        match val {
+            QspiWidth::NONE => 0b00,
+            QspiWidth::SING => 0b01,
+            QspiWidth::DUAL => 0b10,
+            QspiWidth::QUAD => 0b11,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub enum MemorySize { // Written to the DCR `fsize` field; a named size of 2^(n + 1) bytes encodes to n (e.g. _1KiB -> 9).
+    _1KiB,
+    _2KiB,
+    _4KiB,
+    _8KiB,
+    _16KiB,
+    _32KiB,
+    _64KiB,
+    _128KiB,
+    _256KiB,
+    _512KiB,
+    _1MiB,
+    _2MiB,
+    _4MiB,
+    _8MiB,
+    _16MiB,
+    _32MiB,
+    _64MiB,
+    _128MiB,
+    _256MiB,
+    _512MiB,
+    _1GiB,
+    _2GiB,
+    _4GiB,
+    Other(u8), // escape hatch: the wrapped value is used as the encoding unchanged
+}
+
+impl From<MemorySize> for u8 { // `From` also provides `Into<u8>` through the std blanket impl
+    fn from(val: MemorySize) -> u8 { // named sizes map to log2(bytes) - 1; `Other` passes through
+        match val {
+            MemorySize::_1KiB => 9,
+            MemorySize::_2KiB => 10,
+            MemorySize::_4KiB => 11,
+            MemorySize::_8KiB => 12,
+            MemorySize::_16KiB => 13,
+            MemorySize::_32KiB => 14,
+            MemorySize::_64KiB => 15,
+            MemorySize::_128KiB => 16,
+            MemorySize::_256KiB => 17,
+            MemorySize::_512KiB => 18,
+            MemorySize::_1MiB => 19,
+            MemorySize::_2MiB => 20,
+            MemorySize::_4MiB => 21,
+            MemorySize::_8MiB => 22,
+            MemorySize::_16MiB => 23,
+            MemorySize::_32MiB => 24,
+            MemorySize::_64MiB => 25,
+            MemorySize::_128MiB => 26,
+            MemorySize::_256MiB => 27,
+            MemorySize::_512MiB => 28,
+            MemorySize::_1GiB => 29,
+            MemorySize::_2GiB => 30,
+            MemorySize::_4GiB => 31,
+            MemorySize::Other(val) => val,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub enum AddressSize { // Written to the CCR `adsize` field.
+    _8Bit, // encodes to 0b00
+    _16Bit, // encodes to 0b01
+    _24bit, // encodes to 0b10
+    _32bit, // encodes to 0b11
+}
+
+impl From<AddressSize> for u8 { // `From` also provides `Into<u8>` through the std blanket impl
+    fn from(val: AddressSize) -> u8 { // returns the 2-bit address-size encoding
+        match val {
+            AddressSize::_8Bit => 0b00,
+            AddressSize::_16Bit => 0b01,
+            AddressSize::_24bit => 0b10,
+            AddressSize::_32bit => 0b11,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub enum ChipSelectHightTime { // Written to the DCR `csht` field; N cycles encode to N - 1.
+    _1Cycle,
+    _2Cycle,
+    _3Cycle,
+    _4Cycle,
+    _5Cycle,
+    _6Cycle,
+    _7Cycle,
+    _8Cycle,
+}
+
+impl From<ChipSelectHightTime> for u8 { // `From` also provides `Into<u8>` through the std blanket impl
+    fn from(val: ChipSelectHightTime) -> u8 { // N cycles map to N - 1
+        match val {
+            ChipSelectHightTime::_1Cycle => 0,
+            ChipSelectHightTime::_2Cycle => 1,
+            ChipSelectHightTime::_3Cycle => 2,
+            ChipSelectHightTime::_4Cycle => 3,
+            ChipSelectHightTime::_5Cycle => 4,
+            ChipSelectHightTime::_6Cycle => 5,
+            ChipSelectHightTime::_7Cycle => 6,
+            ChipSelectHightTime::_8Cycle => 7,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub enum FIFOThresholdLevel { // Written to the CR `fthres` field; N bytes encode to N - 1.
+    _1Bytes,
+    _2Bytes,
+    _3Bytes,
+    _4Bytes,
+    _5Bytes,
+    _6Bytes,
+    _7Bytes,
+    _8Bytes,
+    _9Bytes,
+    _10Bytes,
+    _11Bytes,
+    _12Bytes,
+    _13Bytes,
+    _14Bytes,
+    _15Bytes,
+    _16Bytes,
+    _17Bytes,
+    _18Bytes,
+    _19Bytes,
+    _20Bytes,
+    _21Bytes,
+    _22Bytes,
+    _23Bytes,
+    _24Bytes,
+    _25Bytes,
+    _26Bytes,
+    _27Bytes,
+    _28Bytes,
+    _29Bytes,
+    _30Bytes,
+    _31Bytes,
+    _32Bytes,
+}
+
+impl From<FIFOThresholdLevel> for u8 { // `From` also provides `Into<u8>` through the std blanket impl
+    fn from(val: FIFOThresholdLevel) -> u8 { // N bytes map to N - 1
+        match val {
+            FIFOThresholdLevel::_1Bytes => 0,
+            FIFOThresholdLevel::_2Bytes => 1,
+            FIFOThresholdLevel::_3Bytes => 2,
+            FIFOThresholdLevel::_4Bytes => 3,
+            FIFOThresholdLevel::_5Bytes => 4,
+            FIFOThresholdLevel::_6Bytes => 5,
+            FIFOThresholdLevel::_7Bytes => 6,
+            FIFOThresholdLevel::_8Bytes => 7,
+            FIFOThresholdLevel::_9Bytes => 8,
+            FIFOThresholdLevel::_10Bytes => 9,
+            FIFOThresholdLevel::_11Bytes => 10,
+            FIFOThresholdLevel::_12Bytes => 11,
+            FIFOThresholdLevel::_13Bytes => 12,
+            FIFOThresholdLevel::_14Bytes => 13,
+            FIFOThresholdLevel::_15Bytes => 14,
+            FIFOThresholdLevel::_16Bytes => 15,
+            FIFOThresholdLevel::_17Bytes => 16,
+            FIFOThresholdLevel::_18Bytes => 17,
+            FIFOThresholdLevel::_19Bytes => 18,
+            FIFOThresholdLevel::_20Bytes => 19,
+            FIFOThresholdLevel::_21Bytes => 20,
+            FIFOThresholdLevel::_22Bytes => 21,
+            FIFOThresholdLevel::_23Bytes => 22,
+            FIFOThresholdLevel::_24Bytes => 23,
+            FIFOThresholdLevel::_25Bytes => 24,
+            FIFOThresholdLevel::_26Bytes => 25,
+            FIFOThresholdLevel::_27Bytes => 26,
+            FIFOThresholdLevel::_28Bytes => 27,
+            FIFOThresholdLevel::_29Bytes => 28,
+            FIFOThresholdLevel::_30Bytes => 29,
+            FIFOThresholdLevel::_31Bytes => 30,
+            FIFOThresholdLevel::_32Bytes => 31,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub enum DummyCycles { // Written to the CCR `dcyc` field; variant _N encodes to N.
+    _0,
+    _1,
+    _2,
+    _3,
+    _4,
+    _5,
+    _6,
+    _7,
+    _8,
+    _9,
+    _10,
+    _11,
+    _12,
+    _13,
+    _14,
+    _15,
+    _16,
+    _17,
+    _18,
+    _19,
+    _20,
+    _21,
+    _22,
+    _23,
+    _24,
+    _25,
+    _26,
+    _27,
+    _28,
+    _29,
+    _30,
+    _31,
+}
+
+impl From<DummyCycles> for u8 { // `From` also provides `Into<u8>` through the std blanket impl
+    fn from(val: DummyCycles) -> u8 { // variant _N maps to N
+        match val {
+            DummyCycles::_0 => 0,
+            DummyCycles::_1 => 1,
+            DummyCycles::_2 => 2,
+            DummyCycles::_3 => 3,
+            DummyCycles::_4 => 4,
+            DummyCycles::_5 => 5,
+            DummyCycles::_6 => 6,
+            DummyCycles::_7 => 7,
+            DummyCycles::_8 => 8,
+            DummyCycles::_9 => 9,
+            DummyCycles::_10 => 10,
+            DummyCycles::_11 => 11,
+            DummyCycles::_12 => 12,
+            DummyCycles::_13 => 13,
+            DummyCycles::_14 => 14,
+            DummyCycles::_15 => 15,
+            DummyCycles::_16 => 16,
+            DummyCycles::_17 => 17,
+            DummyCycles::_18 => 18,
+            DummyCycles::_19 => 19,
+            DummyCycles::_20 => 20,
+            DummyCycles::_21 => 21,
+            DummyCycles::_22 => 22,
+            DummyCycles::_23 => 23,
+            DummyCycles::_24 => 24,
+            DummyCycles::_25 => 25,
+            DummyCycles::_26 => 26,
+            DummyCycles::_27 => 27,
+            DummyCycles::_28 => 28,
+            DummyCycles::_29 => 29,
+            DummyCycles::_30 => 30,
+            DummyCycles::_31 => 31,
+        }
+    }
+}
diff --git a/embassy-stm32/src/qspi/mod.rs b/embassy-stm32/src/qspi/mod.rs
new file mode 100644
index 000000000..f33319620
--- /dev/null
+++ b/embassy-stm32/src/qspi/mod.rs
@@ -0,0 +1,338 @@
+#![macro_use]
+
+pub mod enums;
+
+use embassy_hal_common::{into_ref, PeripheralRef};
+use enums::*;
+
+use crate::dma::TransferOptions;
+use crate::gpio::sealed::AFType;
+use crate::gpio::AnyPin;
+use crate::pac::quadspi::Quadspi as Regs;
+use crate::rcc::RccPeripheral;
+use crate::{peripherals, Peripheral};
+
+pub struct TransferConfig {
+    /// Instruction width (IMODE)
+    pub iwidth: QspiWidth,
+    /// Address width (ADMODE)
+    pub awidth: QspiWidth,
+    /// Data width (DMODE)
+    pub dwidth: QspiWidth,
+    /// Instruction Id
+    pub instruction: u8,
+    /// Flash memory address; the address register is only written when this is `Some`
+    pub address: Option<u32>,
+    /// Number of dummy cycles (DCYC)
+    pub dummy: DummyCycles,
+    /// Length of data in bytes; when `Some`, the data length register is programmed with `len - 1`
+    pub data_len: Option<usize>,
+}
+
+impl Default for TransferConfig {
+    fn default() -> Self {
+        Self {
+            iwidth: QspiWidth::NONE,
+            awidth: QspiWidth::NONE,
+            dwidth: QspiWidth::NONE,
+            instruction: 0,
+            address: None,
+            dummy: DummyCycles::_0,
+            data_len: None,
+        }
+    }
+}
+
+pub struct Config {
+    /// Flash memory size represented as 2^[0-32]; 1KiB (9) was chosen as a reasonable minimum.
+    /// If you need a value other than the predefined ones, use the `Other` variant.
+    pub memory_size: MemorySize,
+    /// Address size (8/16/24/32-bit)
+    pub address_size: AddressSize,
+    /// Scalar factor for generating CLK [0-255]
+    pub prescaler: u8,
+    /// Number of bytes to trigger FIFO threshold flag.
+    pub fifo_threshold: FIFOThresholdLevel,
+    /// Minimum number of cycles that chip select must be high between issued commands
+    pub cs_high_time: ChipSelectHightTime,
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            memory_size: MemorySize::Other(0),
+            address_size: AddressSize::_24bit,
+            prescaler: 128,
+            fifo_threshold: FIFOThresholdLevel::_17Bytes,
+            cs_high_time: ChipSelectHightTime::_5Cycle,
+        }
+    }
+}
+
+#[allow(dead_code)]
+pub struct Qspi<'d, T: Instance, Dma> {
+    _peri: PeripheralRef<'d, T>,
+    sck: Option<PeripheralRef<'d, AnyPin>>,
+    d0: Option<PeripheralRef<'d, AnyPin>>,
+    d1: Option<PeripheralRef<'d, AnyPin>>,
+    d2: Option<PeripheralRef<'d, AnyPin>>,
+    d3: Option<PeripheralRef<'d, AnyPin>>,
+    nss: Option<PeripheralRef<'d, AnyPin>>,
+    dma: PeripheralRef<'d, Dma>,
+    config: Config,
+}
+
+impl<'d, T: Instance, Dma> Qspi<'d, T, Dma> {
+    pub fn new(
+        peri: impl Peripheral<P = T> + 'd,
+        d0: impl Peripheral<P = impl D0Pin<T>> + 'd,
+        d1: impl Peripheral<P = impl D1Pin<T>> + 'd,
+        d2: impl Peripheral<P = impl D2Pin<T>> + 'd,
+        d3: impl Peripheral<P = impl D3Pin<T>> + 'd,
+        sck: impl Peripheral<P = impl SckPin<T>> + 'd,
+        nss: impl Peripheral<P = impl NSSPin<T>> + 'd,
+        dma: impl Peripheral<P = Dma> + 'd,
+        config: Config,
+    ) -> Self {
+        into_ref!(peri, d0, d1, d2, d3, sck, nss);
+
+        unsafe {
+            sck.set_as_af(sck.af_num(), AFType::OutputPushPull);
+            sck.set_speed(crate::gpio::Speed::VeryHigh);
+            nss.set_as_af(nss.af_num(), AFType::OutputPushPull);
+            nss.set_speed(crate::gpio::Speed::VeryHigh);
+            d0.set_as_af(d0.af_num(), AFType::OutputPushPull);
+            d0.set_speed(crate::gpio::Speed::VeryHigh);
+            d1.set_as_af(d1.af_num(), AFType::OutputPushPull);
+            d1.set_speed(crate::gpio::Speed::VeryHigh);
+            d2.set_as_af(d2.af_num(), AFType::OutputPushPull);
+            d2.set_speed(crate::gpio::Speed::VeryHigh);
+            d3.set_as_af(d3.af_num(), AFType::OutputPushPull);
+            d3.set_speed(crate::gpio::Speed::VeryHigh);
+        }
+
+        Self::new_inner(
+            peri,
+            Some(d0.map_into()),
+            Some(d1.map_into()),
+            Some(d2.map_into()),
+            Some(d3.map_into()),
+            Some(sck.map_into()),
+            Some(nss.map_into()),
+            dma,
+            config,
+        )
+    }
+
+    fn new_inner( // enables the peripheral, programs CR and DCR from `config`, then assembles the driver
+        peri: impl Peripheral<P = T> + 'd,
+        d0: Option<PeripheralRef<'d, AnyPin>>,
+        d1: Option<PeripheralRef<'d, AnyPin>>,
+        d2: Option<PeripheralRef<'d, AnyPin>>,
+        d3: Option<PeripheralRef<'d, AnyPin>>,
+        sck: Option<PeripheralRef<'d, AnyPin>>,
+        nss: Option<PeripheralRef<'d, AnyPin>>,
+        dma: impl Peripheral<P = Dma> + 'd,
+        config: Config,
+    ) -> Self {
+        into_ref!(peri, dma);
+
+        T::enable();
+        unsafe {
+            T::REGS.cr().write(|w| w.set_fthres(config.fifo_threshold.into()));
+
+            while T::REGS.sr().read().busy() {} // spin until SR.busy reads false
+
+            T::REGS.cr().modify(|w| { // `modify`, not `write`: keep the FIFO threshold programmed above
+                w.set_prescaler(config.prescaler);
+                w.set_en(true);
+            });
+            T::REGS.dcr().write(|w| {
+                w.set_fsize(config.memory_size.into());
+                w.set_csht(config.cs_high_time.into());
+                w.set_ckmode(false);
+            });
+        }
+
+        Self {
+            _peri: peri,
+            sck,
+            d0,
+            d1,
+            d2,
+            d3,
+            nss,
+            dma,
+            config,
+        }
+    }
+
+    pub fn command(&mut self, transaction: TransferConfig) {
+        unsafe {
+            T::REGS.cr().modify(|v| v.set_dmaen(false));
+            self.setup_transaction(QspiMode::IndirectWrite, &transaction);
+
+            while !T::REGS.sr().read().tcf() {}
+            T::REGS.fcr().modify(|v| v.set_ctcf(true));
+        }
+    }
+
+    pub fn blocking_read(&mut self, buf: &mut [u8], transaction: TransferConfig) { // polled read into `buf`
+        unsafe {
+            T::REGS.cr().modify(|v| v.set_dmaen(false)); // polled transfer: DMA enable bit off
+            self.setup_transaction(QspiMode::IndirectWrite, &transaction);
+
+            if let Some(len) = transaction.data_len {
+                let current_ar = T::REGS.ar().read().address();
+                T::REGS.ccr().modify(|v| {
+                    v.set_fmode(QspiMode::IndirectRead.into());
+                });
+                T::REGS.ar().write(|v| { // write the same address back after switching to read mode
+                    v.set_address(current_ar);
+                });
+
+                for idx in 0..len { // NOTE: `buf[idx]` panics if `data_len` exceeds `buf.len()`
+                    while !T::REGS.sr().read().tcf() && !T::REGS.sr().read().ftf() {}
+                    buf[idx] = (T::REGS.dr().ptr() as *mut u8).read_volatile(); // MMIO: volatile byte read of DR
+                }
+            }
+
+            while !T::REGS.sr().read().tcf() {} // spin until SR.tcf is set
+            T::REGS.fcr().modify(|v| v.set_ctcf(true));
+        }
+    }
+
+    pub fn blocking_write(&mut self, buf: &[u8], transaction: TransferConfig) { // polled write from `buf`
+        unsafe {
+            T::REGS.cr().modify(|v| v.set_dmaen(false)); // polled transfer: DMA enable bit off
+            self.setup_transaction(QspiMode::IndirectWrite, &transaction);
+
+            if let Some(len) = transaction.data_len {
+                T::REGS.ccr().modify(|v| {
+                    v.set_fmode(QspiMode::IndirectWrite.into());
+                });
+
+                for idx in 0..len { // NOTE: `buf[idx]` panics if `data_len` exceeds `buf.len()`
+                    while !T::REGS.sr().read().ftf() {}
+                    (T::REGS.dr().ptr() as *mut u8).write_volatile(buf[idx]); // MMIO: volatile byte write to DR
+                }
+            }
+
+            while !T::REGS.sr().read().tcf() {} // spin until SR.tcf is set
+            T::REGS.fcr().modify(|v| v.set_ctcf(true));
+        }
+    }
+
+    pub fn blocking_read_dma(&mut self, buf: &mut [u8], transaction: TransferConfig)
+    where
+        Dma: QuadDma<T>,
+    {
+        unsafe {
+            self.setup_transaction(QspiMode::IndirectWrite, &transaction);
+
+            let request = self.dma.request();
+            let options = TransferOptions::default();
+
+            T::REGS.ccr().modify(|v| {
+                v.set_fmode(QspiMode::IndirectRead.into());
+            });
+            let current_ar = T::REGS.ar().read().address();
+            T::REGS.ar().write(|v| {
+                v.set_address(current_ar);
+            });
+
+            self.dma
+                .start_read(request, T::REGS.dr().ptr() as *mut u8, buf, options);
+
+            T::REGS.cr().modify(|v| v.set_dmaen(true));
+
+            while self.dma.is_running() {}
+        }
+    }
+
+    pub fn blocking_write_dma(&mut self, buf: &[u8], transaction: TransferConfig)
+    where
+        Dma: QuadDma<T>,
+    {
+        unsafe {
+            self.setup_transaction(QspiMode::IndirectWrite, &transaction);
+
+            let request = self.dma.request();
+            let options = TransferOptions::default();
+
+            T::REGS.ccr().modify(|v| {
+                v.set_fmode(QspiMode::IndirectWrite.into());
+            });
+
+            self.dma
+                .start_write(request, buf, T::REGS.dr().ptr() as *mut u8, options);
+
+            T::REGS.cr().modify(|v| v.set_dmaen(true));
+
+            while self.dma.is_running() {}
+        }
+    }
+
+    fn setup_transaction(&mut self, fmode: QspiMode, transaction: &TransferConfig) { // programs FCR, DLR, CCR, AR
+        unsafe {
+            T::REGS.fcr().modify(|v| { // presumably clears stale status flags - TODO confirm
+                v.set_csmf(true);
+                v.set_ctcf(true);
+                v.set_ctef(true);
+                v.set_ctof(true);
+            });
+
+            while T::REGS.sr().read().busy() {} // spin until SR.busy reads false
+
+            if let Some(len) = transaction.data_len {
+                T::REGS.dlr().write(|v| v.set_dl(len as u32 - 1)); // NOTE(review): underflows for Some(0)
+            }
+
+            T::REGS.ccr().write(|v| {
+                v.set_fmode(fmode.into());
+                v.set_imode(transaction.iwidth.into());
+                v.set_instruction(transaction.instruction);
+                v.set_admode(transaction.awidth.into());
+                v.set_adsize(self.config.address_size.into()); // address size comes from the driver config
+                v.set_dmode(transaction.dwidth.into());
+                v.set_abmode(QspiWidth::NONE.into()); // abmode is always set to NONE here
+                v.set_dcyc(transaction.dummy.into());
+            });
+
+            if let Some(addr) = transaction.address { // AR is written only when an address is supplied
+                T::REGS.ar().write(|v| {
+                    v.set_address(addr);
+                });
+            }
+        }
+    }
+}
+
+pub(crate) mod sealed {
+    use super::*;
+
+    pub trait Instance {
+        const REGS: Regs;
+    }
+}
+
+pub trait Instance: Peripheral<P = Self> + sealed::Instance + RccPeripheral {}
+
+pin_trait!(SckPin, Instance);
+pin_trait!(D0Pin, Instance);
+pin_trait!(D1Pin, Instance);
+pin_trait!(D2Pin, Instance);
+pin_trait!(D3Pin, Instance);
+pin_trait!(NSSPin, Instance);
+
+dma_trait!(QuadDma, Instance);
+
+foreach_peripheral!(
+    (quadspi, $inst:ident) => {
+        impl sealed::Instance for peripherals::$inst {
+            const REGS: Regs = crate::pac::$inst;
+        }
+
+        impl Instance for peripherals::$inst {}
+    };
+);
diff --git a/embassy-stm32/src/usart/buffered.rs b/embassy-stm32/src/usart/buffered.rs
index a27fcc1ca..cd7d72f91 100644
--- a/embassy-stm32/src/usart/buffered.rs
+++ b/embassy-stm32/src/usart/buffered.rs
@@ -197,6 +197,40 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
         .await
     }
 
+    fn inner_blocking_read(&self, buf: &mut [u8]) -> Result<usize, Error> {
+        // Busy-wait: retry until at least one byte was copied out of the RX buffer.
+        loop {
+            let mut rx_was_full = false;
+            let mut inner = self.inner.borrow_mut();
+            let copied = inner.with(|state| {
+                compiler_fence(Ordering::SeqCst);
+
+                let ready = state.rx.pop_buf();
+                if ready.is_empty() {
+                    // Nothing buffered yet; report zero so the outer loop retries.
+                    return 0;
+                }
+
+                let count = core::cmp::min(ready.len(), buf.len());
+                buf[..count].copy_from_slice(&ready[..count]);
+
+                // Sample fullness before popping; the result decides whether to pend below.
+                rx_was_full = state.rx.is_full();
+                state.rx.pop(count);
+
+                count
+            });
+
+            if rx_was_full {
+                inner.pend();
+            }
+
+            if copied > 0 {
+                return Ok(copied);
+            }
+        }
+    }
+
     async fn inner_write<'a>(&'a self, buf: &'a [u8]) -> Result<usize, Error> {
         poll_fn(move |cx| {
             let mut inner = self.inner.borrow_mut();
@@ -236,6 +270,39 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
         .await
     }
 
+    fn inner_blocking_write(&self, buf: &[u8]) -> Result<usize, Error> {
+        // Busy-wait: retry until the TX buffer has accepted at least one byte.
+        loop {
+            let mut inner = self.inner.borrow_mut();
+            let (written, was_idle) = inner.with(|state| {
+                // Record whether TX was empty before anything is queued.
+                let was_idle = state.tx.is_empty();
+                let free = state.tx.push_buf();
+                if free.is_empty() {
+                    return (0, was_idle);
+                }
+                let count = buf.len().min(free.len());
+                free[..count].copy_from_slice(&buf[..count]);
+                state.tx.push(count);
+                (count, was_idle)
+            });
+            if was_idle {
+                inner.pend();
+            }
+            if written > 0 {
+                return Ok(written);
+            }
+        }
+    }
+
+    fn inner_blocking_flush(&self) -> Result<(), Error> {
+        loop { // busy-wait until the TX buffer reports empty
+            if self.inner.borrow_mut().with(|state| state.tx.is_empty()) {
+                return Ok(());
+            }
+        }
+    }
+
     async fn inner_fill_buf<'a>(&'a self) -> Result<&'a [u8], Error> {
         poll_fn(move |cx| {
             self.inner.borrow_mut().with(|state| {
@@ -419,3 +486,35 @@ impl<'u, 'd, T: BasicInstance> embedded_io::asynch::Write for BufferedUartTx<'u,
         self.inner.inner_flush().await
     }
 }
+
+impl<'d, T: BasicInstance> embedded_io::blocking::Read for BufferedUart<'d, T> {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
+        self.inner_blocking_read(buf)
+    }
+}
+
+impl<'u, 'd, T: BasicInstance> embedded_io::blocking::Read for BufferedUartRx<'u, 'd, T> {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
+        self.inner.inner_blocking_read(buf)
+    }
+}
+
+impl<'d, T: BasicInstance> embedded_io::blocking::Write for BufferedUart<'d, T> {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
+        self.inner_blocking_write(buf)
+    }
+
+    fn flush(&mut self) -> Result<(), Self::Error> {
+        self.inner_blocking_flush()
+    }
+}
+
+impl<'u, 'd, T: BasicInstance> embedded_io::blocking::Write for BufferedUartTx<'u, 'd, T> {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
+        self.inner.inner_blocking_write(buf)
+    }
+
+    fn flush(&mut self) -> Result<(), Self::Error> {
+        self.inner.inner_blocking_flush()
+    }
+}
diff --git a/embassy-sync/src/pipe.rs b/embassy-sync/src/pipe.rs
index 1977005fb..ee27cdec8 100644
--- a/embassy-sync/src/pipe.rs
+++ b/embassy-sync/src/pipe.rs
@@ -32,16 +32,16 @@ impl<'p, M, const N: usize> Writer<'p, M, N>
 where
     M: RawMutex,
 {
-    /// Writes a value.
+    /// Write some bytes to the pipe.
     ///
     /// See [`Pipe::write()`]
     pub fn write<'a>(&'a self, buf: &'a [u8]) -> WriteFuture<'a, M, N> {
         self.pipe.write(buf)
     }
 
-    /// Attempt to immediately write a message.
+    /// Attempt to immediately write some bytes to the pipe.
     ///
-    /// See [`Pipe::write()`]
+    /// See [`Pipe::try_write()`]
     pub fn try_write(&self, buf: &[u8]) -> Result<usize, TryWriteError> {
         self.pipe.try_write(buf)
     }
@@ -95,16 +95,16 @@ impl<'p, M, const N: usize> Reader<'p, M, N>
 where
     M: RawMutex,
 {
-    /// Reads a value.
+    /// Read some bytes from the pipe.
     ///
     /// See [`Pipe::read()`]
     pub fn read<'a>(&'a self, buf: &'a mut [u8]) -> ReadFuture<'a, M, N> {
         self.pipe.read(buf)
     }
 
-    /// Attempt to immediately read a message.
+    /// Attempt to immediately read some bytes from the pipe.
     ///
-    /// See [`Pipe::read()`]
+    /// See [`Pipe::try_read()`]
     pub fn try_read(&self, buf: &mut [u8]) -> Result<usize, TryReadError> {
         self.pipe.try_read(buf)
     }
@@ -221,12 +221,11 @@ impl<const N: usize> PipeState<N> {
     }
 }
 
-/// A bounded pipe for communicating between asynchronous tasks
+/// A bounded byte-oriented pipe for communicating between asynchronous tasks
 /// with backpressure.
 ///
-/// The pipe will buffer up to the provided number of messages.  Once the
-/// buffer is full, attempts to `write` new messages will wait until a message is
-/// read from the pipe.
+/// The pipe will buffer up to the provided number of bytes. Once the
+/// buffer is full, attempts to `write` new bytes will wait until buffer space is freed up.
 ///
 /// All data written will become available in the same order as it was written.
 pub struct Pipe<M, const N: usize>
@@ -277,40 +276,56 @@ where
         Reader { pipe: self }
     }
 
-    /// Write a value, waiting until there is capacity.
+    /// Write some bytes to the pipe.
     ///
-    /// Writeing completes when the value has been pushed to the pipe's queue.
-    /// This doesn't mean the value has been read yet.
+    /// This method writes a nonzero amount of bytes from `buf` into the pipe, and
+    /// returns the amount of bytes written.
+    ///
+    /// If it is not possible to write a nonzero amount of bytes because the pipe's buffer is full,
+    /// this method will wait until space becomes available. See [`try_write`](Self::try_write) for a variant that
+    /// returns an error instead of waiting.
+    ///
+    /// It is not guaranteed that all bytes in the buffer are written, even if there's enough
+    /// free space in the pipe buffer for all. In other words, it is possible for `write` to return
+    /// without writing all of `buf` (returning a number less than `buf.len()`) and still leave
+    /// free space in the pipe buffer. You should always `write` in a loop, or use helpers like
+    /// `write_all` from the `embedded-io` crate.
     pub fn write<'a>(&'a self, buf: &'a [u8]) -> WriteFuture<'a, M, N> {
         WriteFuture { pipe: self, buf }
     }
 
-    /// Attempt to immediately write a message.
+    /// Attempt to immediately write some bytes to the pipe.
     ///
-    /// This method differs from [`write`](Pipe::write) by returning immediately if the pipe's
-    /// buffer is full, instead of waiting.
-    ///
-    /// # Errors
-    ///
-    /// If the pipe capacity has been reached, i.e., the pipe has `n`
-    /// buffered values where `n` is the argument passed to [`Pipe`], then an
-    /// error is returned.
+    /// This method will either write a nonzero amount of bytes to the pipe immediately,
+    /// or return an error if the pipe is full. See [`write`](Self::write) for a variant
+    /// that waits instead of returning an error.
     pub fn try_write(&self, buf: &[u8]) -> Result<usize, TryWriteError> {
         self.lock(|c| c.try_write(buf))
     }
 
-    /// Receive the next value.
+    /// Read some bytes from the pipe.
     ///
-    /// If there are no messages in the pipe's buffer, this method will
-    /// wait until a message is written.
+    /// This method reads a nonzero amount of bytes from the pipe into `buf` and
+    /// returns the amount of bytes read.
+    ///
+    /// If it is not possible to read a nonzero amount of bytes because the pipe's buffer is empty,
+    /// this method will wait until data becomes available. See [`try_read`](Self::try_read) for a variant that
+    /// returns an error instead of waiting.
+    ///
+    /// It is not guaranteed that all bytes in the buffer are read, even if there's enough
+    /// space in `buf` for all. In other words, it is possible for `read` to return
+    /// without filling `buf` (returning a number less than `buf.len()`) and still leave bytes
+    /// in the pipe buffer. You should always `read` in a loop, or use helpers like
+    /// `read_exact` from the `embedded-io` crate.
     pub fn read<'a>(&'a self, buf: &'a mut [u8]) -> ReadFuture<'a, M, N> {
         ReadFuture { pipe: self, buf }
     }
 
-    /// Attempt to immediately read a message.
+    /// Attempt to immediately read some bytes from the pipe.
     ///
-    /// This method will either read a message from the pipe immediately or return an error
-    /// if the pipe is empty.
+    /// This method will either read a nonzero amount of bytes from the pipe immediately,
+    /// or return an error if the pipe is empty. See [`read`](Self::read) for a variant
+    /// that waits instead of returning an error.
     pub fn try_read(&self, buf: &mut [u8]) -> Result<usize, TryReadError> {
         self.lock(|c| c.try_read(buf))
     }
diff --git a/embassy-usb/src/builder.rs b/embassy-usb/src/builder.rs
index 305dfa02e..6b68bcd7b 100644
--- a/embassy-usb/src/builder.rs
+++ b/embassy-usb/src/builder.rs
@@ -201,6 +201,14 @@ impl<'d, D: Driver<'d>> Builder<'d, D> {
         self.config_descriptor.end_configuration();
         self.bos_descriptor.end_bos();
 
+        // Log the number of allocator bytes actually used in descriptor buffers
+        info!("USB: device_descriptor used: {}", self.device_descriptor.position());
+        info!("USB: config_descriptor used: {}", self.config_descriptor.position());
+        info!("USB: bos_descriptor used: {}", self.bos_descriptor.writer.position());
+        #[cfg(feature = "msos-descriptor")]
+        info!("USB: msos_descriptor used: {}", msos_descriptor.len());
+        info!("USB: control_buf size: {}", self.control_buf.len());
+
         UsbDevice::build(
             self.driver,
             self.config,
diff --git a/embassy-usb/src/class/hid.rs b/embassy-usb/src/class/hid.rs
index 974268c62..03e4c1dbb 100644
--- a/embassy-usb/src/class/hid.rs
+++ b/embassy-usb/src/class/hid.rs
@@ -458,6 +458,9 @@ impl<'d> Handler for Control<'d> {
             return None;
         }
 
+        // This uses a defmt-specific formatter that causes use of the `log`
+        // feature to fail to build, so leave it defmt-specific for now.
+        #[cfg(feature = "defmt")]
         trace!("HID control_out {:?} {=[u8]:x}", req, data);
         match req.request {
             HID_REQ_SET_IDLE => {
diff --git a/embassy-usb/src/lib.rs b/embassy-usb/src/lib.rs
index bfeccd5fe..3016b81cb 100644
--- a/embassy-usb/src/lib.rs
+++ b/embassy-usb/src/lib.rs
@@ -165,6 +165,25 @@ struct Interface {
     num_alt_settings: u8,
 }
 
+/// A report of the used size of the runtime allocated buffers
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+#[cfg_attr(feature = "defmt", derive(defmt::Format))]
+pub struct UsbBufferReport {
+    /// Number of device descriptor bytes used
+    pub device_descriptor_used: usize,
+    /// Number of config descriptor bytes used
+    pub config_descriptor_used: usize,
+    /// Number of bos descriptor bytes used
+    pub bos_descriptor_used: usize,
+    /// Number of msos descriptor bytes used
+    ///
+    /// Will be `None` if the "msos-descriptor" feature is not active.
+    /// Otherwise will return Some(bytes).
+    pub msos_descriptor_used: Option<usize>,
+    /// Size of the control buffer
+    pub control_buffer_size: usize,
+}
+
 /// Main struct for the USB device stack.
 pub struct UsbDevice<'d, D: Driver<'d>> {
     control_buf: &'d mut [u8],
@@ -239,6 +258,24 @@ impl<'d, D: Driver<'d>> UsbDevice<'d, D> {
         }
     }
 
+    /// Reports the used size of the descriptor buffers and the size of the control buffer.
+    ///
+    /// Handy when tuning buffer sizes to what a device actually needs.
+    pub fn buffer_usage(&self) -> UsbBufferReport {
+        #[cfg(feature = "msos-descriptor")]
+        let msos_descriptor_used = Some(self.inner.msos_descriptor.len());
+        #[cfg(not(feature = "msos-descriptor"))]
+        let msos_descriptor_used = None;
+
+        UsbBufferReport {
+            device_descriptor_used: self.inner.device_descriptor.len(),
+            config_descriptor_used: self.inner.config_descriptor.len(),
+            bos_descriptor_used: self.inner.bos_descriptor.len(),
+            msos_descriptor_used,
+            control_buffer_size: self.control_buf.len(),
+        }
+    }
+
     /// Runs the `UsbDevice` forever.
     ///
     /// This future may leave the bus in an invalid state if it is dropped.
diff --git a/embassy-usb/src/msos.rs b/embassy-usb/src/msos.rs
index b1e0335ee..218d9931a 100644
--- a/embassy-usb/src/msos.rs
+++ b/embassy-usb/src/msos.rs
@@ -32,6 +32,11 @@ impl<'d> MsOsDescriptorSet<'d> {
     pub fn is_empty(&self) -> bool {
         self.descriptor.is_empty()
     }
+
+    /// Returns the length of the descriptor field
+    pub fn len(&self) -> usize {
+        self.descriptor.len()
+    }
 }
 
 /// Writes a Microsoft OS 2.0 Descriptor set into a buffer.
diff --git a/tests/rp/src/bin/spi_async.rs b/tests/rp/src/bin/spi_async.rs
index 6c85ef60a..2e22c9de7 100644
--- a/tests/rp/src/bin/spi_async.rs
+++ b/tests/rp/src/bin/spi_async.rs
@@ -1,3 +1,6 @@
+//! Make sure to connect GPIO pins 3 (`PIN_3`) and 4 (`PIN_4`) together
+//! to run this test.
+//!
 #![no_std]
 #![no_main]
 #![feature(type_alias_impl_trait)]
@@ -18,10 +21,63 @@ async fn main(_spawner: Spawner) {
 
     let mut spi = Spi::new(p.SPI0, clk, mosi, miso, p.DMA_CH0, p.DMA_CH1, Config::default());
 
-    let tx_buf = [1_u8, 2, 3, 4, 5, 6];
-    let mut rx_buf = [0_u8; 6];
-    spi.transfer(&mut rx_buf, &tx_buf).await.unwrap();
-    assert_eq!(rx_buf, tx_buf);
+    // equal rx & tx buffers
+    {
+        let tx_buf = [1_u8, 2, 3, 4, 5, 6];
+        let mut rx_buf = [0_u8; 6];
+        spi.transfer(&mut rx_buf, &tx_buf).await.unwrap();
+        assert_eq!(rx_buf, tx_buf);
+    }
+
+    // tx > rx buffer
+    {
+        let tx_buf = [7_u8, 8, 9, 10, 11, 12];
+
+        let mut rx_buf = [0_u8; 3];
+        spi.transfer(&mut rx_buf, &tx_buf).await.unwrap();
+        assert_eq!(rx_buf, tx_buf[..3]);
+
+        defmt::info!("tx > rx buffer - OK");
+    }
+
+    // make sure that clearing the FIFO works after the uneven buffers
+
+    // equal rx & tx buffers
+    {
+        let tx_buf = [13_u8, 14, 15, 16, 17, 18];
+        let mut rx_buf = [0_u8; 6];
+        spi.transfer(&mut rx_buf, &tx_buf).await.unwrap();
+        assert_eq!(rx_buf, tx_buf);
+
+        defmt::info!("buffer rx length == tx length - OK");
+    }
+
+    // rx > tx buffer
+    {
+        let tx_buf = [19_u8, 20, 21];
+        let mut rx_buf = [0_u8; 6];
+
+        // we should have written dummy data to tx buffer to sync clock.
+        spi.transfer(&mut rx_buf, &tx_buf).await.unwrap();
+
+        assert_eq!(
+            rx_buf[..3],
+            tx_buf,
+            "only the first 3 TX bytes should have been received in the RX buffer"
+        );
+        assert_eq!(rx_buf[3..], [0, 0, 0], "the rest of the RX bytes should be empty");
+        defmt::info!("buffer rx length > tx length - OK");
+    }
+
+    // equal rx & tx buffers
+    {
+        let tx_buf = [22_u8, 23, 24, 25, 26, 27];
+        let mut rx_buf = [0_u8; 6];
+        spi.transfer(&mut rx_buf, &tx_buf).await.unwrap();
+
+        assert_eq!(rx_buf, tx_buf);
+        defmt::info!("buffer rx length = tx length - OK");
+    }
 
     info!("Test OK");
     cortex_m::asm::bkpt();