From 183f2f6913032600d74ea058b50a1fcedbebe719 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20=C5=A0pa=C4=8Dek?= <patek.mail@gmail.com>
Date: Thu, 18 Apr 2024 18:50:30 +0200
Subject: [PATCH] stm32/usart: remove instance generic params

---
 embassy-stm32/build.rs                       |   6 +-
 embassy-stm32/src/usart/buffered.rs          | 372 ++++++++-------
 embassy-stm32/src/usart/mod.rs               | 456 ++++++++++---------
 embassy-stm32/src/usart/ringbuffered.rs      |  57 +--
 examples/stm32h5/src/bin/usart_split.rs      |   3 +-
 examples/stm32h7/src/bin/usart_split.rs      |   3 +-
 examples/stm32h7rs/src/bin/usart_split.rs    |   3 +-
 tests/stm32/src/bin/usart_rx_ringbuffered.rs |   4 +-
 8 files changed, 470 insertions(+), 434 deletions(-)

diff --git a/embassy-stm32/build.rs b/embassy-stm32/build.rs
index 4eed6fe7d..7bf6ffba2 100644
--- a/embassy-stm32/build.rs
+++ b/embassy-stm32/build.rs
@@ -387,7 +387,6 @@ fn main() {
     struct ClockGen<'a> {
         rcc_registers: &'a PeripheralRegisters,
         chained_muxes: HashMap<&'a str, &'a PeripheralRccRegister>,
-        force_refcount: HashSet<&'a str>,
 
         refcount_statics: BTreeSet<Ident>,
         clock_names: BTreeSet<String>,
@@ -397,7 +396,6 @@ fn main() {
     let mut clock_gen = ClockGen {
         rcc_registers,
         chained_muxes: HashMap::new(),
-        force_refcount: HashSet::from(["usart"]),
 
         refcount_statics: BTreeSet::new(),
         clock_names: BTreeSet::new(),
@@ -542,7 +540,6 @@ fn main() {
                 None => (TokenStream::new(), TokenStream::new()),
             };
 
-            let ptype = if let Some(reg) = &p.registers { reg.kind } else { "" };
             let pname = format_ident!("{}", p.name);
             let en_reg = format_ident!("{}", en.register.to_ascii_lowercase());
             let set_en_field = format_ident!("set_{}", en.field.to_ascii_lowercase());
@@ -570,8 +567,7 @@ fn main() {
             };
             let en_bit_offs: u8 = en_bit_offs.offset.try_into().unwrap();
 
-            let refcount =
-                clock_gen.force_refcount.contains(ptype) || *rcc_field_count.get(&(en.register, en.field)).unwrap() > 1;
+            let refcount = *rcc_field_count.get(&(en.register, en.field)).unwrap() > 1;
             let (before_enable, before_disable) = if refcount {
                 let refcount_static =
                     format_ident!("{}_{}", en.register.to_ascii_uppercase(), en.field.to_ascii_uppercase());
diff --git a/embassy-stm32/src/usart/buffered.rs b/embassy-stm32/src/usart/buffered.rs
index 52011cd1f..492ad334b 100644
--- a/embassy-stm32/src/usart/buffered.rs
+++ b/embassy-stm32/src/usart/buffered.rs
@@ -1,7 +1,7 @@
 use core::future::poll_fn;
 use core::marker::PhantomData;
 use core::slice;
-use core::sync::atomic::{AtomicBool, Ordering};
+use core::sync::atomic::{AtomicBool, AtomicU8, Ordering};
 use core::task::Poll;
 
 use embassy_embedded_hal::SetConfig;
@@ -12,154 +12,163 @@ use embassy_sync::waitqueue::AtomicWaker;
 #[cfg(not(any(usart_v1, usart_v2)))]
 use super::DePin;
 use super::{
-    clear_interrupt_flags, configure, rdr, reconfigure, sr, tdr, BasicInstance, Config, ConfigError, CtsPin, Error,
-    RtsPin, RxPin, TxPin,
+    clear_interrupt_flags, configure, rdr, reconfigure, sr, tdr, Config, ConfigError, CtsPin, Error, Info, Instance,
+    Regs, RtsPin, RxPin, TxPin,
 };
 use crate::gpio::AFType;
-use crate::interrupt;
-use crate::interrupt::typelevel::Interrupt;
+use crate::interrupt::typelevel::Interrupt as _;
+use crate::interrupt::{self, InterruptExt};
+use crate::time::Hertz;
 
 /// Interrupt handler.
-pub struct InterruptHandler<T: BasicInstance> {
+pub struct InterruptHandler<T: Instance> {
     _phantom: PhantomData<T>,
 }
 
-impl<T: BasicInstance> interrupt::typelevel::Handler<T::Interrupt> for InterruptHandler<T> {
+impl<T: Instance> interrupt::typelevel::Handler<T::Interrupt> for InterruptHandler<T> {
     unsafe fn on_interrupt() {
-        let r = T::regs();
-        let state = T::buffered_state();
+        on_interrupt(T::info().regs, T::buffered_state())
+    }
+}
 
-        // RX
-        let sr_val = sr(r).read();
-        // On v1 & v2, reading DR clears the rxne, error and idle interrupt
-        // flags. Keep this close to the SR read to reduce the chance of a
-        // flag being set in-between.
-        let dr = if sr_val.rxne() || cfg!(any(usart_v1, usart_v2)) && (sr_val.ore() || sr_val.idle()) {
-            Some(rdr(r).read_volatile())
+unsafe fn on_interrupt(r: Regs, state: &'static State) {
+    // RX
+    let sr_val = sr(r).read();
+    // On v1 & v2, reading DR clears the rxne, error and idle interrupt
+    // flags. Keep this close to the SR read to reduce the chance of a
+    // flag being set in-between.
+    let dr = if sr_val.rxne() || cfg!(any(usart_v1, usart_v2)) && (sr_val.ore() || sr_val.idle()) {
+        Some(rdr(r).read_volatile())
+    } else {
+        None
+    };
+    clear_interrupt_flags(r, sr_val);
+
+    if sr_val.pe() {
+        warn!("Parity error");
+    }
+    if sr_val.fe() {
+        warn!("Framing error");
+    }
+    if sr_val.ne() {
+        warn!("Noise error");
+    }
+    if sr_val.ore() {
+        warn!("Overrun error");
+    }
+    if sr_val.rxne() {
+        let mut rx_writer = state.rx_buf.writer();
+        let buf = rx_writer.push_slice();
+        if !buf.is_empty() {
+            if let Some(byte) = dr {
+                buf[0] = byte;
+                rx_writer.push_done(1);
+            }
         } else {
-            None
-        };
-        clear_interrupt_flags(r, sr_val);
-
-        if sr_val.pe() {
-            warn!("Parity error");
-        }
-        if sr_val.fe() {
-            warn!("Framing error");
-        }
-        if sr_val.ne() {
-            warn!("Noise error");
-        }
-        if sr_val.ore() {
-            warn!("Overrun error");
-        }
-        if sr_val.rxne() {
-            let mut rx_writer = state.rx_buf.writer();
-            let buf = rx_writer.push_slice();
-            if !buf.is_empty() {
-                if let Some(byte) = dr {
-                    buf[0] = byte;
-                    rx_writer.push_done(1);
-                }
-            } else {
-                // FIXME: Should we disable any further RX interrupts when the buffer becomes full.
-            }
-
-            if !state.rx_buf.is_empty() {
-                state.rx_waker.wake();
-            }
+            // FIXME: Should we disable any further RX interrupts when the buffer becomes full.
         }
 
-        if sr_val.idle() {
+        if !state.rx_buf.is_empty() {
             state.rx_waker.wake();
         }
+    }
 
-        // With `usart_v4` hardware FIFO is enabled and Transmission complete (TC)
-        // indicates that all bytes are pushed out from the FIFO.
-        // For other usart variants it shows that last byte from the buffer was just sent.
-        if sr_val.tc() {
-            // For others it is cleared above with `clear_interrupt_flags`.
-            #[cfg(any(usart_v1, usart_v2))]
-            sr(r).modify(|w| w.set_tc(false));
+    if sr_val.idle() {
+        state.rx_waker.wake();
+    }
 
+    // With `usart_v4` hardware FIFO is enabled and Transmission complete (TC)
+    // indicates that all bytes are pushed out from the FIFO.
+    // For other usart variants it shows that last byte from the buffer was just sent.
+    if sr_val.tc() {
+        // For others it is cleared above with `clear_interrupt_flags`.
+        #[cfg(any(usart_v1, usart_v2))]
+        sr(r).modify(|w| w.set_tc(false));
+
+        r.cr1().modify(|w| {
+            w.set_tcie(false);
+        });
+
+        state.tx_done.store(true, Ordering::Release);
+        state.tx_waker.wake();
+    }
+
+    // TX
+    if sr(r).read().txe() {
+        let mut tx_reader = state.tx_buf.reader();
+        let buf = tx_reader.pop_slice();
+        if !buf.is_empty() {
             r.cr1().modify(|w| {
-                w.set_tcie(false);
+                w.set_txeie(true);
             });
 
-            state.tx_done.store(true, Ordering::Release);
-            state.tx_waker.wake();
-        }
-
-        // TX
-        if sr(r).read().txe() {
-            let mut tx_reader = state.tx_buf.reader();
-            let buf = tx_reader.pop_slice();
-            if !buf.is_empty() {
+            // Enable transmission complete interrupt when last byte is going to be sent out.
+            if buf.len() == 1 {
                 r.cr1().modify(|w| {
-                    w.set_txeie(true);
-                });
-
-                // Enable transmission complete interrupt when last byte is going to be sent out.
-                if buf.len() == 1 {
-                    r.cr1().modify(|w| {
-                        w.set_tcie(true);
-                    });
-                }
-
-                tdr(r).write_volatile(buf[0].into());
-                tx_reader.pop_done(1);
-            } else {
-                // Disable interrupt until we have something to transmit again.
-                r.cr1().modify(|w| {
-                    w.set_txeie(false);
+                    w.set_tcie(true);
                 });
             }
+
+            tdr(r).write_volatile(buf[0].into());
+            tx_reader.pop_done(1);
+        } else {
+            // Disable interrupt until we have something to transmit again.
+            r.cr1().modify(|w| {
+                w.set_txeie(false);
+            });
         }
     }
 }
 
-pub(crate) struct State {
-    pub(crate) rx_waker: AtomicWaker,
-    pub(crate) rx_buf: RingBuffer,
-    pub(crate) tx_waker: AtomicWaker,
-    pub(crate) tx_buf: RingBuffer,
-    pub(crate) tx_done: AtomicBool,
+pub(super) struct State {
+    rx_waker: AtomicWaker,
+    rx_buf: RingBuffer,
+    tx_waker: AtomicWaker,
+    tx_buf: RingBuffer,
+    tx_done: AtomicBool,
+    tx_rx_refcount: AtomicU8,
 }
 
 impl State {
-    /// Create new state
-    pub(crate) const fn new() -> Self {
+    pub(super) const fn new() -> Self {
         Self {
             rx_buf: RingBuffer::new(),
             tx_buf: RingBuffer::new(),
             rx_waker: AtomicWaker::new(),
             tx_waker: AtomicWaker::new(),
             tx_done: AtomicBool::new(true),
+            tx_rx_refcount: AtomicU8::new(0),
         }
     }
 }
 
 /// Bidirectional buffered UART
-pub struct BufferedUart<'d, T: BasicInstance> {
-    rx: BufferedUartRx<'d, T>,
-    tx: BufferedUartTx<'d, T>,
+pub struct BufferedUart<'d> {
+    rx: BufferedUartRx<'d>,
+    tx: BufferedUartTx<'d>,
 }
 
 /// Tx-only buffered UART
 ///
 /// Created with [BufferedUart::split]
-pub struct BufferedUartTx<'d, T: BasicInstance> {
-    phantom: PhantomData<&'d mut T>,
+pub struct BufferedUartTx<'d> {
+    info: &'static Info,
+    state: &'static State,
+    kernel_clock: Hertz,
+    _phantom: PhantomData<&'d mut ()>,
 }
 
 /// Rx-only buffered UART
 ///
 /// Created with [BufferedUart::split]
-pub struct BufferedUartRx<'d, T: BasicInstance> {
-    phantom: PhantomData<&'d mut T>,
+pub struct BufferedUartRx<'d> {
+    info: &'static Info,
+    state: &'static State,
+    kernel_clock: Hertz,
+    _phantom: PhantomData<&'d mut ()>,
 }
 
-impl<'d, T: BasicInstance> SetConfig for BufferedUart<'d, T> {
+impl<'d> SetConfig for BufferedUart<'d> {
     type Config = Config;
     type ConfigError = ConfigError;
 
@@ -168,7 +177,7 @@ impl<'d, T: BasicInstance> SetConfig for BufferedUart<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> SetConfig for BufferedUartRx<'d, T> {
+impl<'d> SetConfig for BufferedUartRx<'d> {
     type Config = Config;
     type ConfigError = ConfigError;
 
@@ -177,7 +186,7 @@ impl<'d, T: BasicInstance> SetConfig for BufferedUartRx<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> SetConfig for BufferedUartTx<'d, T> {
+impl<'d> SetConfig for BufferedUartTx<'d> {
     type Config = Config;
     type ConfigError = ConfigError;
 
@@ -186,9 +195,9 @@ impl<'d, T: BasicInstance> SetConfig for BufferedUartTx<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> BufferedUart<'d, T> {
+impl<'d> BufferedUart<'d> {
     /// Create a new bidirectional buffered UART driver
-    pub fn new(
+    pub fn new<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         _irq: impl interrupt::typelevel::Binding<T::Interrupt, InterruptHandler<T>> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
@@ -197,15 +206,13 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
         rx_buffer: &'d mut [u8],
         config: Config,
     ) -> Result<Self, ConfigError> {
-        // UartRx and UartTx have one refcount ea.
-        T::enable_and_reset();
         T::enable_and_reset();
 
         Self::new_inner(peri, rx, tx, tx_buffer, rx_buffer, config)
     }
 
     /// Create a new bidirectional buffered UART driver with request-to-send and clear-to-send pins
-    pub fn new_with_rtscts(
+    pub fn new_with_rtscts<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         _irq: impl interrupt::typelevel::Binding<T::Interrupt, InterruptHandler<T>> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
@@ -218,13 +225,11 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
     ) -> Result<Self, ConfigError> {
         into_ref!(cts, rts);
 
-        // UartRx and UartTx have one refcount ea.
-        T::enable_and_reset();
         T::enable_and_reset();
 
         rts.set_as_af(rts.af_num(), AFType::OutputPushPull);
         cts.set_as_af(cts.af_num(), AFType::Input);
-        T::regs().cr3().write(|w| {
+        T::info().regs.cr3().write(|w| {
             w.set_rtse(true);
             w.set_ctse(true);
         });
@@ -234,7 +239,7 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
 
     /// Create a new bidirectional buffered UART driver with a driver-enable pin
     #[cfg(not(any(usart_v1, usart_v2)))]
-    pub fn new_with_de(
+    pub fn new_with_de<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         _irq: impl interrupt::typelevel::Binding<T::Interrupt, InterruptHandler<T>> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
@@ -246,19 +251,17 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
     ) -> Result<Self, ConfigError> {
         into_ref!(de);
 
-        // UartRx and UartTx have one refcount ea.
-        T::enable_and_reset();
         T::enable_and_reset();
 
         de.set_as_af(de.af_num(), AFType::OutputPushPull);
-        T::regs().cr3().write(|w| {
+        T::info().regs.cr3().write(|w| {
             w.set_dem(true);
         });
 
         Self::new_inner(peri, rx, tx, tx_buffer, rx_buffer, config)
     }
 
-    fn new_inner(
+    fn new_inner<T: Instance>(
         _peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
@@ -268,17 +271,19 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
     ) -> Result<Self, ConfigError> {
         into_ref!(_peri, rx, tx);
 
+        let info = T::info();
         let state = T::buffered_state();
+        let kernel_clock = T::frequency();
         let len = tx_buffer.len();
         unsafe { state.tx_buf.init(tx_buffer.as_mut_ptr(), len) };
         let len = rx_buffer.len();
         unsafe { state.rx_buf.init(rx_buffer.as_mut_ptr(), len) };
 
-        let r = T::regs();
+        let r = info.regs;
         rx.set_as_af(rx.af_num(), AFType::Input);
         tx.set_as_af(tx.af_num(), AFType::OutputPushPull);
 
-        configure(r, &config, T::frequency(), T::KIND, true, true)?;
+        configure(info, kernel_clock, &config, true, true)?;
 
         r.cr1().modify(|w| {
             w.set_rxneie(true);
@@ -288,22 +293,34 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
         T::Interrupt::unpend();
         unsafe { T::Interrupt::enable() };
 
+        state.tx_rx_refcount.store(2, Ordering::Relaxed);
+
         Ok(Self {
-            rx: BufferedUartRx { phantom: PhantomData },
-            tx: BufferedUartTx { phantom: PhantomData },
+            rx: BufferedUartRx {
+                info,
+                state,
+                kernel_clock,
+                _phantom: PhantomData,
+            },
+            tx: BufferedUartTx {
+                info,
+                state,
+                kernel_clock,
+                _phantom: PhantomData,
+            },
         })
     }
 
     /// Split the driver into a Tx and Rx part (useful for sending to separate tasks)
-    pub fn split(self) -> (BufferedUartTx<'d, T>, BufferedUartRx<'d, T>) {
+    pub fn split(self) -> (BufferedUartTx<'d>, BufferedUartRx<'d>) {
         (self.tx, self.rx)
     }
 
     /// Reconfigure the driver
     pub fn set_config(&mut self, config: &Config) -> Result<(), ConfigError> {
-        reconfigure::<T>(config)?;
+        reconfigure(self.rx.info, self.rx.kernel_clock, config)?;
 
-        T::regs().cr1().modify(|w| {
+        self.rx.info.regs.cr1().modify(|w| {
             w.set_rxneie(true);
             w.set_idleie(true);
         });
@@ -312,10 +329,10 @@ impl<'d, T: BasicInstance> BufferedUart<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> BufferedUartRx<'d, T> {
+impl<'d> BufferedUartRx<'d> {
     async fn read(&self, buf: &mut [u8]) -> Result<usize, Error> {
         poll_fn(move |cx| {
-            let state = T::buffered_state();
+            let state = self.state;
             let mut rx_reader = unsafe { state.rx_buf.reader() };
             let data = rx_reader.pop_slice();
 
@@ -327,7 +344,7 @@ impl<'d, T: BasicInstance> BufferedUartRx<'d, T> {
                 rx_reader.pop_done(len);
 
                 if do_pend {
-                    T::Interrupt::pend();
+                    self.info.interrupt.pend();
                 }
 
                 return Poll::Ready(Ok(len));
@@ -341,7 +358,7 @@ impl<'d, T: BasicInstance> BufferedUartRx<'d, T> {
 
     fn blocking_read(&self, buf: &mut [u8]) -> Result<usize, Error> {
         loop {
-            let state = T::buffered_state();
+            let state = self.state;
             let mut rx_reader = unsafe { state.rx_buf.reader() };
             let data = rx_reader.pop_slice();
 
@@ -353,7 +370,7 @@ impl<'d, T: BasicInstance> BufferedUartRx<'d, T> {
                 rx_reader.pop_done(len);
 
                 if do_pend {
-                    T::Interrupt::pend();
+                    self.info.interrupt.pend();
                 }
 
                 return Ok(len);
@@ -363,7 +380,7 @@ impl<'d, T: BasicInstance> BufferedUartRx<'d, T> {
 
     async fn fill_buf(&self) -> Result<&[u8], Error> {
         poll_fn(move |cx| {
-            let state = T::buffered_state();
+            let state = self.state;
             let mut rx_reader = unsafe { state.rx_buf.reader() };
             let (p, n) = rx_reader.pop_buf();
             if n == 0 {
@@ -378,20 +395,20 @@ impl<'d, T: BasicInstance> BufferedUartRx<'d, T> {
     }
 
     fn consume(&self, amt: usize) {
-        let state = T::buffered_state();
+        let state = self.state;
         let mut rx_reader = unsafe { state.rx_buf.reader() };
         let full = state.rx_buf.is_full();
         rx_reader.pop_done(amt);
         if full {
-            T::Interrupt::pend();
+            self.info.interrupt.pend();
         }
     }
 
     /// Reconfigure the driver
     pub fn set_config(&mut self, config: &Config) -> Result<(), ConfigError> {
-        reconfigure::<T>(config)?;
+        reconfigure(self.info, self.kernel_clock, config)?;
 
-        T::regs().cr1().modify(|w| {
+        self.info.regs.cr1().modify(|w| {
             w.set_rxneie(true);
             w.set_idleie(true);
         });
@@ -400,10 +417,10 @@ impl<'d, T: BasicInstance> BufferedUartRx<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
+impl<'d> BufferedUartTx<'d> {
     async fn write(&self, buf: &[u8]) -> Result<usize, Error> {
         poll_fn(move |cx| {
-            let state = T::buffered_state();
+            let state = self.state;
             state.tx_done.store(false, Ordering::Release);
 
             let empty = state.tx_buf.is_empty();
@@ -420,7 +437,7 @@ impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
             tx_writer.push_done(n);
 
             if empty {
-                T::Interrupt::pend();
+                self.info.interrupt.pend();
             }
 
             Poll::Ready(Ok(n))
@@ -430,7 +447,7 @@ impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
 
     async fn flush(&self) -> Result<(), Error> {
         poll_fn(move |cx| {
-            let state = T::buffered_state();
+            let state = self.state;
 
             if !state.tx_done.load(Ordering::Acquire) {
                 state.tx_waker.register(cx.waker());
@@ -444,7 +461,7 @@ impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
 
     fn blocking_write(&self, buf: &[u8]) -> Result<usize, Error> {
         loop {
-            let state = T::buffered_state();
+            let state = self.state;
             let empty = state.tx_buf.is_empty();
 
             let mut tx_writer = unsafe { state.tx_buf.writer() };
@@ -455,7 +472,7 @@ impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
                 tx_writer.push_done(n);
 
                 if empty {
-                    T::Interrupt::pend();
+                    self.info.interrupt.pend();
                 }
 
                 return Ok(n);
@@ -465,7 +482,7 @@ impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
 
     fn blocking_flush(&self) -> Result<(), Error> {
         loop {
-            let state = T::buffered_state();
+            let state = self.state;
             if state.tx_buf.is_empty() {
                 return Ok(());
             }
@@ -474,9 +491,9 @@ impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
 
     /// Reconfigure the driver
     pub fn set_config(&mut self, config: &Config) -> Result<(), ConfigError> {
-        reconfigure::<T>(config)?;
+        reconfigure(self.info, self.kernel_clock, config)?;
 
-        T::regs().cr1().modify(|w| {
+        self.info.regs.cr1().modify(|w| {
             w.set_rxneie(true);
             w.set_idleie(true);
         });
@@ -485,65 +502,78 @@ impl<'d, T: BasicInstance> BufferedUartTx<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> Drop for BufferedUartRx<'d, T> {
+impl<'d> Drop for BufferedUartRx<'d> {
     fn drop(&mut self) {
-        let state = T::buffered_state();
+        let state = self.state;
         unsafe {
             state.rx_buf.deinit();
 
             // TX is inactive if the the buffer is not available.
             // We can now unregister the interrupt handler
             if state.tx_buf.len() == 0 {
-                T::Interrupt::disable();
+                self.info.interrupt.disable();
             }
         }
 
-        T::disable();
+        drop_tx_rx(self.info, state);
     }
 }
 
-impl<'d, T: BasicInstance> Drop for BufferedUartTx<'d, T> {
+impl<'d> Drop for BufferedUartTx<'d> {
     fn drop(&mut self) {
-        let state = T::buffered_state();
+        let state = self.state;
         unsafe {
             state.tx_buf.deinit();
 
             // RX is inactive if the the buffer is not available.
             // We can now unregister the interrupt handler
             if state.rx_buf.len() == 0 {
-                T::Interrupt::disable();
+                self.info.interrupt.disable();
             }
         }
 
-        T::disable();
+        drop_tx_rx(self.info, state);
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::ErrorType for BufferedUart<'d, T> {
+fn drop_tx_rx(info: &Info, state: &State) {
+    // Atomic subtraction is not supported on all targets, so decrement inside a critical section
+    let remaining = critical_section::with(|_| {
+        let refcount = state.tx_rx_refcount.load(Ordering::Relaxed);
+        assert!(refcount >= 1);
+        state.tx_rx_refcount.store(refcount - 1, Ordering::Relaxed);
+        refcount - 1
+    });
+    if remaining == 0 {
+        info.enable_bit.disable();
+    }
+}
+
+impl<'d> embedded_io_async::ErrorType for BufferedUart<'d> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::ErrorType for BufferedUartRx<'d, T> {
+impl<'d> embedded_io_async::ErrorType for BufferedUartRx<'d> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::ErrorType for BufferedUartTx<'d, T> {
+impl<'d> embedded_io_async::ErrorType for BufferedUartTx<'d> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::Read for BufferedUart<'d, T> {
+impl<'d> embedded_io_async::Read for BufferedUart<'d> {
     async fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
         self.rx.read(buf).await
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::Read for BufferedUartRx<'d, T> {
+impl<'d> embedded_io_async::Read for BufferedUartRx<'d> {
     async fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
         Self::read(self, buf).await
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::BufRead for BufferedUart<'d, T> {
+impl<'d> embedded_io_async::BufRead for BufferedUart<'d> {
     async fn fill_buf(&mut self) -> Result<&[u8], Self::Error> {
         self.rx.fill_buf().await
     }
@@ -553,7 +583,7 @@ impl<'d, T: BasicInstance> embedded_io_async::BufRead for BufferedUart<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::BufRead for BufferedUartRx<'d, T> {
+impl<'d> embedded_io_async::BufRead for BufferedUartRx<'d> {
     async fn fill_buf(&mut self) -> Result<&[u8], Self::Error> {
         Self::fill_buf(self).await
     }
@@ -563,7 +593,7 @@ impl<'d, T: BasicInstance> embedded_io_async::BufRead for BufferedUartRx<'d, T>
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::Write for BufferedUart<'d, T> {
+impl<'d> embedded_io_async::Write for BufferedUart<'d> {
     async fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         self.tx.write(buf).await
     }
@@ -573,7 +603,7 @@ impl<'d, T: BasicInstance> embedded_io_async::Write for BufferedUart<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io_async::Write for BufferedUartTx<'d, T> {
+impl<'d> embedded_io_async::Write for BufferedUartTx<'d> {
     async fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         Self::write(self, buf).await
     }
@@ -583,19 +613,19 @@ impl<'d, T: BasicInstance> embedded_io_async::Write for BufferedUartTx<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io::Read for BufferedUart<'d, T> {
+impl<'d> embedded_io::Read for BufferedUart<'d> {
     fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
         self.rx.blocking_read(buf)
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io::Read for BufferedUartRx<'d, T> {
+impl<'d> embedded_io::Read for BufferedUartRx<'d> {
     fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
         self.blocking_read(buf)
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io::Write for BufferedUart<'d, T> {
+impl<'d> embedded_io::Write for BufferedUart<'d> {
     fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         self.tx.blocking_write(buf)
     }
@@ -605,7 +635,7 @@ impl<'d, T: BasicInstance> embedded_io::Write for BufferedUart<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> embedded_io::Write for BufferedUartTx<'d, T> {
+impl<'d> embedded_io::Write for BufferedUartTx<'d> {
     fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         Self::blocking_write(self, buf)
     }
@@ -615,11 +645,11 @@ impl<'d, T: BasicInstance> embedded_io::Write for BufferedUartTx<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_02::serial::Read<u8> for BufferedUartRx<'d, T> {
+impl<'d> embedded_hal_02::serial::Read<u8> for BufferedUartRx<'d> {
     type Error = Error;
 
     fn read(&mut self) -> Result<u8, nb::Error<Self::Error>> {
-        let r = T::regs();
+        let r = self.info.regs;
         unsafe {
             let sr = sr(r).read();
             if sr.pe() {
@@ -643,7 +673,7 @@ impl<'d, T: BasicInstance> embedded_hal_02::serial::Read<u8> for BufferedUartRx<
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_02::blocking::serial::Write<u8> for BufferedUartTx<'d, T> {
+impl<'d> embedded_hal_02::blocking::serial::Write<u8> for BufferedUartTx<'d> {
     type Error = Error;
 
     fn bwrite_all(&mut self, mut buffer: &[u8]) -> Result<(), Self::Error> {
@@ -662,7 +692,7 @@ impl<'d, T: BasicInstance> embedded_hal_02::blocking::serial::Write<u8> for Buff
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_02::serial::Read<u8> for BufferedUart<'d, T> {
+impl<'d> embedded_hal_02::serial::Read<u8> for BufferedUart<'d> {
     type Error = Error;
 
     fn read(&mut self) -> Result<u8, nb::Error<Self::Error>> {
@@ -670,7 +700,7 @@ impl<'d, T: BasicInstance> embedded_hal_02::serial::Read<u8> for BufferedUart<'d
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_02::blocking::serial::Write<u8> for BufferedUart<'d, T> {
+impl<'d> embedded_hal_02::blocking::serial::Write<u8> for BufferedUart<'d> {
     type Error = Error;
 
     fn bwrite_all(&mut self, mut buffer: &[u8]) -> Result<(), Self::Error> {
@@ -689,25 +719,25 @@ impl<'d, T: BasicInstance> embedded_hal_02::blocking::serial::Write<u8> for Buff
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_nb::serial::ErrorType for BufferedUart<'d, T> {
+impl<'d> embedded_hal_nb::serial::ErrorType for BufferedUart<'d> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance> embedded_hal_nb::serial::ErrorType for BufferedUartTx<'d, T> {
+impl<'d> embedded_hal_nb::serial::ErrorType for BufferedUartTx<'d> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance> embedded_hal_nb::serial::ErrorType for BufferedUartRx<'d, T> {
+impl<'d> embedded_hal_nb::serial::ErrorType for BufferedUartRx<'d> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance> embedded_hal_nb::serial::Read for BufferedUartRx<'d, T> {
+impl<'d> embedded_hal_nb::serial::Read for BufferedUartRx<'d> {
     fn read(&mut self) -> nb::Result<u8, Self::Error> {
         embedded_hal_02::serial::Read::read(self)
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_nb::serial::Write for BufferedUartTx<'d, T> {
+impl<'d> embedded_hal_nb::serial::Write for BufferedUartTx<'d> {
     fn write(&mut self, char: u8) -> nb::Result<(), Self::Error> {
         self.blocking_write(&[char]).map(drop).map_err(nb::Error::Other)
     }
@@ -717,13 +747,13 @@ impl<'d, T: BasicInstance> embedded_hal_nb::serial::Write for BufferedUartTx<'d,
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_nb::serial::Read for BufferedUart<'d, T> {
+impl<'d> embedded_hal_nb::serial::Read for BufferedUart<'d> {
     fn read(&mut self) -> Result<u8, nb::Error<Self::Error>> {
         embedded_hal_02::serial::Read::read(&mut self.rx)
     }
 }
 
-impl<'d, T: BasicInstance> embedded_hal_nb::serial::Write for BufferedUart<'d, T> {
+impl<'d> embedded_hal_nb::serial::Write for BufferedUart<'d> {
     fn write(&mut self, char: u8) -> nb::Result<(), Self::Error> {
         self.tx.blocking_write(&[char]).map(drop).map_err(nb::Error::Other)
     }
diff --git a/embassy-stm32/src/usart/mod.rs b/embassy-stm32/src/usart/mod.rs
index a6dfbd482..b24335f3a 100644
--- a/embassy-stm32/src/usart/mod.rs
+++ b/embassy-stm32/src/usart/mod.rs
@@ -4,7 +4,7 @@
 
 use core::future::poll_fn;
 use core::marker::PhantomData;
-use core::sync::atomic::{compiler_fence, Ordering};
+use core::sync::atomic::{compiler_fence, AtomicU8, Ordering};
 use core::task::Poll;
 
 use embassy_embedded_hal::SetConfig;
@@ -15,7 +15,8 @@ use futures_util::future::{select, Either};
 
 use crate::dma::ChannelAndRequest;
 use crate::gpio::{AFType, AnyPin, SealedPin};
-use crate::interrupt::typelevel::Interrupt;
+use crate::interrupt::typelevel::Interrupt as _;
+use crate::interrupt::{self, Interrupt, InterruptExt};
 use crate::mode::{Async, Blocking, Mode};
 #[allow(unused_imports)]
 #[cfg(not(any(usart_v1, usart_v2)))]
@@ -27,57 +28,59 @@ use crate::pac::usart::Lpuart as Regs;
 #[cfg(any(usart_v1, usart_v2))]
 use crate::pac::usart::Usart as Regs;
 use crate::pac::usart::{regs, vals};
+use crate::rcc::{ClockEnableBit, SealedRccPeripheral};
 use crate::time::Hertz;
-use crate::{interrupt, peripherals, Peripheral};
+use crate::Peripheral;
 
 /// Interrupt handler.
-pub struct InterruptHandler<T: BasicInstance> {
+pub struct InterruptHandler<T: Instance> {
     _phantom: PhantomData<T>,
 }
 
-impl<T: BasicInstance> interrupt::typelevel::Handler<T::Interrupt> for InterruptHandler<T> {
+impl<T: Instance> interrupt::typelevel::Handler<T::Interrupt> for InterruptHandler<T> {
     unsafe fn on_interrupt() {
-        let r = T::regs();
-        let s = T::state();
-
-        let (sr, cr1, cr3) = (sr(r).read(), r.cr1().read(), r.cr3().read());
-
-        let has_errors = (sr.pe() && cr1.peie()) || ((sr.fe() || sr.ne() || sr.ore()) && cr3.eie());
-        if has_errors {
-            // clear all interrupts and DMA Rx Request
-            r.cr1().modify(|w| {
-                // disable RXNE interrupt
-                w.set_rxneie(false);
-                // disable parity interrupt
-                w.set_peie(false);
-                // disable idle line interrupt
-                w.set_idleie(false);
-            });
-            r.cr3().modify(|w| {
-                // disable Error Interrupt: (Frame error, Noise error, Overrun error)
-                w.set_eie(false);
-                // disable DMA Rx Request
-                w.set_dmar(false);
-            });
-        } else if cr1.idleie() && sr.idle() {
-            // IDLE detected: no more data will come
-            r.cr1().modify(|w| {
-                // disable idle line detection
-                w.set_idleie(false);
-            });
-        } else if cr1.rxneie() {
-            // We cannot check the RXNE flag as it is auto-cleared by the DMA controller
-
-            // It is up to the listener to determine if this in fact was a RX event and disable the RXNE detection
-        } else {
-            return;
-        }
-
-        compiler_fence(Ordering::SeqCst);
-        s.rx_waker.wake();
+        on_interrupt(T::info().regs, T::state())
     }
 }
 
+unsafe fn on_interrupt(r: Regs, s: &'static State) {
+    let (sr, cr1, cr3) = (sr(r).read(), r.cr1().read(), r.cr3().read());
+
+    let has_errors = (sr.pe() && cr1.peie()) || ((sr.fe() || sr.ne() || sr.ore()) && cr3.eie());
+    if has_errors {
+        // disable every RX interrupt source and the DMA Rx request (the status flags are not cleared here)
+        r.cr1().modify(|w| {
+            // disable RXNE interrupt
+            w.set_rxneie(false);
+            // disable parity error interrupt
+            w.set_peie(false);
+            // disable idle line interrupt
+            w.set_idleie(false);
+        });
+        r.cr3().modify(|w| {
+            // disable the error interrupt (frame error, noise error, overrun error)
+            w.set_eie(false);
+            // disable DMA Rx request
+            w.set_dmar(false);
+        });
+    } else if cr1.idleie() && sr.idle() {
+        // IDLE detected while its interrupt is enabled: no more data will come
+        r.cr1().modify(|w| {
+            // disable idle line detection
+            w.set_idleie(false);
+        });
+    } else if cr1.rxneie() {
+        // We cannot check the RXNE flag as it is auto-cleared by the DMA controller
+
+        // It is up to the listener to determine whether this was in fact an RX event and to disable RXNE detection
+    } else {
+        return; // none of the enabled RX events matched: leave without waking the task
+    }
+
+    compiler_fence(Ordering::SeqCst);
+    s.rx_waker.wake(); // wake whoever registered on `rx_waker`
+}
+
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 #[cfg_attr(feature = "defmt", derive(defmt::Format))]
 /// Number of data bits
@@ -239,12 +242,12 @@ enum ReadCompletionEvent {
 ///
 /// See [`UartRx`] for more details, and see [`BufferedUart`] and [`RingBufferedUartRx`]
 /// as alternatives that do provide the necessary guarantees for `embedded_io::Read`.
-pub struct Uart<'d, T: BasicInstance, M: Mode> {
-    tx: UartTx<'d, T, M>,
-    rx: UartRx<'d, T, M>,
+pub struct Uart<'d, M: Mode> {
+    tx: UartTx<'d, M>,
+    rx: UartRx<'d, M>,
 }
 
-impl<'d, T: BasicInstance, M: Mode> SetConfig for Uart<'d, T, M> {
+impl<'d, M: Mode> SetConfig for Uart<'d, M> {
     type Config = Config;
     type ConfigError = ConfigError;
 
@@ -258,15 +261,18 @@ impl<'d, T: BasicInstance, M: Mode> SetConfig for Uart<'d, T, M> {
 ///
 /// Can be obtained from [`Uart::split`], or can be constructed independently,
 /// if you do not need the receiving half of the driver.
-pub struct UartTx<'d, T: BasicInstance, M: Mode> {
-    _phantom: PhantomData<(T, M)>,
+pub struct UartTx<'d, M: Mode> {
+    info: &'static Info,
+    state: &'static State,
+    kernel_clock: Hertz,
     tx: Option<PeripheralRef<'d, AnyPin>>,
     cts: Option<PeripheralRef<'d, AnyPin>>,
     de: Option<PeripheralRef<'d, AnyPin>>,
     tx_dma: Option<ChannelAndRequest<'d>>,
+    _phantom: PhantomData<M>,
 }
 
-impl<'d, T: BasicInstance, M: Mode> SetConfig for UartTx<'d, T, M> {
+impl<'d, M: Mode> SetConfig for UartTx<'d, M> {
     type Config = Config;
     type ConfigError = ConfigError;
 
@@ -304,17 +310,20 @@ impl<'d, T: BasicInstance, M: Mode> SetConfig for UartTx<'d, T, M> {
 /// store data received between calls.
 ///
 /// Also see [this github comment](https://github.com/embassy-rs/embassy/pull/2185#issuecomment-1810047043).
-pub struct UartRx<'d, T: BasicInstance, M: Mode> {
-    _phantom: PhantomData<(T, M)>,
+pub struct UartRx<'d, M: Mode> {
+    info: &'static Info,
+    state: &'static State,
+    kernel_clock: Hertz,
     rx: Option<PeripheralRef<'d, AnyPin>>,
     rts: Option<PeripheralRef<'d, AnyPin>>,
     rx_dma: Option<ChannelAndRequest<'d>>,
     detect_previous_overrun: bool,
     #[cfg(any(usart_v1, usart_v2))]
     buffered_sr: stm32_metapac::usart::regs::Sr,
+    _phantom: PhantomData<M>,
 }
 
-impl<'d, T: BasicInstance, M: Mode> SetConfig for UartRx<'d, T, M> {
+impl<'d, M: Mode> SetConfig for UartRx<'d, M> {
     type Config = Config;
     type ConfigError = ConfigError;
 
@@ -323,9 +332,9 @@ impl<'d, T: BasicInstance, M: Mode> SetConfig for UartRx<'d, T, M> {
     }
 }
 
-impl<'d, T: BasicInstance> UartTx<'d, T, Async> {
+impl<'d> UartTx<'d, Async> {
     /// Useful if you only want Uart Tx. It saves 1 pin and consumes a little less power.
-    pub fn new(
+    pub fn new<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
         tx_dma: impl Peripheral<P = impl TxDma<T>> + 'd,
@@ -341,7 +350,7 @@ impl<'d, T: BasicInstance> UartTx<'d, T, Async> {
     }
 
     /// Create a new tx-only UART with a clear-to-send pin
-    pub fn new_with_cts(
+    pub fn new_with_cts<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
         cts: impl Peripheral<P = impl CtsPin<T>> + 'd,
@@ -359,7 +368,7 @@ impl<'d, T: BasicInstance> UartTx<'d, T, Async> {
 
     /// Initiate an asynchronous UART write
     pub async fn write(&mut self, buffer: &[u8]) -> Result<(), Error> {
-        let r = T::regs();
+        let r = self.info.regs;
 
         // Disable Receiver for Half-Duplex mode
         if r.cr3().read().hdsel() {
@@ -377,21 +386,17 @@ impl<'d, T: BasicInstance> UartTx<'d, T, Async> {
         Ok(())
     }
 
-    async fn flush_inner() -> Result<(), Error> {
-        Self::blocking_flush_inner()
-    }
-
     /// Wait until transmission complete
     pub async fn flush(&mut self) -> Result<(), Error> {
-        Self::flush_inner().await
+        self.blocking_flush()
     }
 }
 
-impl<'d, T: BasicInstance> UartTx<'d, T, Blocking> {
+impl<'d> UartTx<'d, Blocking> {
     /// Create a new blocking tx-only UART with no hardware flow control.
     ///
     /// Useful if you only want Uart Tx. It saves 1 pin and consumes a little less power.
-    pub fn new_blocking(
+    pub fn new_blocking<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
         config: Config,
@@ -400,7 +405,7 @@ impl<'d, T: BasicInstance> UartTx<'d, T, Blocking> {
     }
 
     /// Create a new blocking tx-only UART with a clear-to-send pin
-    pub fn new_blocking_with_cts(
+    pub fn new_blocking_with_cts<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
         cts: impl Peripheral<P = impl CtsPin<T>> + 'd,
@@ -416,8 +421,8 @@ impl<'d, T: BasicInstance> UartTx<'d, T, Blocking> {
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> UartTx<'d, T, M> {
-    fn new_inner(
+impl<'d, M: Mode> UartTx<'d, M> {
+    fn new_inner<T: Instance>(
         _peri: impl Peripheral<P = T> + 'd,
         tx: Option<PeripheralRef<'d, AnyPin>>,
         cts: Option<PeripheralRef<'d, AnyPin>>,
@@ -426,16 +431,21 @@ impl<'d, T: BasicInstance, M: Mode> UartTx<'d, T, M> {
     ) -> Result<Self, ConfigError> {
         T::enable_and_reset();
 
-        let r = T::regs();
+        let info = T::info();
+        let state = T::state();
+        let kernel_clock = T::frequency();
+        let r = info.regs;
         r.cr3().modify(|w| {
             w.set_ctse(cts.is_some());
         });
-        configure(r, &config, T::frequency(), T::KIND, false, true)?;
+        configure(info, kernel_clock, &config, false, true)?;
 
-        // create state once!
-        let _s = T::state();
+        state.tx_rx_refcount.store(1, Ordering::Relaxed);
 
         Ok(Self {
+            info,
+            state,
+            kernel_clock,
             tx,
             cts,
             de: None,
@@ -446,12 +456,12 @@ impl<'d, T: BasicInstance, M: Mode> UartTx<'d, T, M> {
 
     /// Reconfigure the driver
     pub fn set_config(&mut self, config: &Config) -> Result<(), ConfigError> {
-        reconfigure::<T>(config)
+        reconfigure(self.info, self.kernel_clock, config)
     }
 
     /// Perform a blocking UART write
     pub fn blocking_write(&mut self, buffer: &[u8]) -> Result<(), Error> {
-        let r = T::regs();
+        let r = self.info.regs;
 
         // Disable Receiver for Half-Duplex mode
         if r.cr3().read().hdsel() {
@@ -465,28 +475,29 @@ impl<'d, T: BasicInstance, M: Mode> UartTx<'d, T, M> {
         Ok(())
     }
 
-    fn blocking_flush_inner() -> Result<(), Error> {
-        let r = T::regs();
-        while !sr(r).read().tc() {}
-
-        // Enable Receiver after transmission complete for Half-Duplex mode
-        if r.cr3().read().hdsel() {
-            r.cr1().modify(|reg| reg.set_re(true));
-        }
-        Ok(())
-    }
-
     /// Block until transmission complete
     pub fn blocking_flush(&mut self) -> Result<(), Error> {
-        Self::blocking_flush_inner()
+        blocking_flush(self.info)
     }
 }
 
-impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
+fn blocking_flush(info: &Info) -> Result<(), Error> {
+    let regs = info.regs;
+    while !sr(regs).read().tc() {} // spin until the TC (transmission complete) flag reads as set
+
+    // Half-duplex (HDSEL set): turn the receiver back on now that transmission is complete
+    if regs.cr3().read().hdsel() {
+        regs.cr1().modify(|cr1| cr1.set_re(true));
+    }
+
+    Ok(())
+}
+
+impl<'d> UartRx<'d, Async> {
     /// Create a new rx-only UART with no hardware flow control.
     ///
     /// Useful if you only want Uart Rx. It saves 1 pin and consumes a little less power.
-    pub fn new(
+    pub fn new<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         _irq: impl interrupt::typelevel::Binding<T::Interrupt, InterruptHandler<T>> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
@@ -497,7 +508,7 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
     }
 
     /// Create a new rx-only UART with a request-to-send pin
-    pub fn new_with_rts(
+    pub fn new_with_rts<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         _irq: impl interrupt::typelevel::Binding<T::Interrupt, InterruptHandler<T>> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
@@ -531,11 +542,11 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
         buffer: &mut [u8],
         enable_idle_line_detection: bool,
     ) -> Result<ReadCompletionEvent, Error> {
-        let r = T::regs();
+        let r = self.info.regs;
 
         // Call flush for Half-Duplex mode. It prevents reading of bytes which have just been written.
         if r.cr3().read().hdsel() {
-            UartTx::<'d, T, Async>::flush_inner().await?;
+            blocking_flush(self.info)?;
         }
 
         // make sure USART state is restored to neutral state when this future is dropped
@@ -565,7 +576,7 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
         // Start USART DMA
         // will not do anything yet because DMAR is not yet set
         // future which will complete when DMA Read request completes
-        let transfer = unsafe { ch.read(rdr(T::regs()), buffer, Default::default()) };
+        let transfer = unsafe { ch.read(rdr(r), buffer, Default::default()) };
 
         // clear ORE flag just before enabling DMA Rx Request: can be mandatory for the second transfer
         if !self.detect_previous_overrun {
@@ -640,9 +651,8 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
         compiler_fence(Ordering::SeqCst);
 
         // future which completes when idle line or error is detected
+        let s = self.state;
         let abort = poll_fn(move |cx| {
-            let s = T::state();
-
             s.rx_waker.register(cx.waker());
 
             let sr = sr(r).read();
@@ -728,11 +738,11 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
     }
 }
 
-impl<'d, T: BasicInstance> UartRx<'d, T, Blocking> {
+impl<'d> UartRx<'d, Blocking> {
     /// Create a new rx-only UART with no hardware flow control.
     ///
     /// Useful if you only want Uart Rx. It saves 1 pin and consumes a little less power.
-    pub fn new_blocking(
+    pub fn new_blocking<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         config: Config,
@@ -741,7 +751,7 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Blocking> {
     }
 
     /// Create a new rx-only UART with a request-to-send pin
-    pub fn new_blocking_with_rts(
+    pub fn new_blocking_with_rts<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         rts: impl Peripheral<P = impl RtsPin<T>> + 'd,
@@ -757,8 +767,8 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Blocking> {
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> UartRx<'d, T, M> {
-    fn new_inner(
+impl<'d, M: Mode> UartRx<'d, M> {
+    fn new_inner<T: Instance>(
         _peri: impl Peripheral<P = T> + 'd,
         rx: Option<PeripheralRef<'d, AnyPin>>,
         rts: Option<PeripheralRef<'d, AnyPin>>,
@@ -767,20 +777,25 @@ impl<'d, T: BasicInstance, M: Mode> UartRx<'d, T, M> {
     ) -> Result<Self, ConfigError> {
         T::enable_and_reset();
 
-        let r = T::regs();
+        let info = T::info();
+        let state = T::state();
+        let kernel_clock = T::frequency();
+        let r = info.regs;
         r.cr3().write(|w| {
             w.set_rtse(rts.is_some());
         });
-        configure(r, &config, T::frequency(), T::KIND, true, false)?;
+        configure(info, kernel_clock, &config, true, false)?;
 
         T::Interrupt::unpend();
         unsafe { T::Interrupt::enable() };
 
-        // create state once!
-        let _s = T::state();
+        state.tx_rx_refcount.store(1, Ordering::Relaxed);
 
         Ok(Self {
             _phantom: PhantomData,
+            info,
+            state,
+            kernel_clock,
             rx,
             rts,
             rx_dma,
@@ -792,12 +807,12 @@ impl<'d, T: BasicInstance, M: Mode> UartRx<'d, T, M> {
 
     /// Reconfigure the driver
     pub fn set_config(&mut self, config: &Config) -> Result<(), ConfigError> {
-        reconfigure::<T>(config)
+        reconfigure(self.info, self.kernel_clock, config)
     }
 
     #[cfg(any(usart_v1, usart_v2))]
     fn check_rx_flags(&mut self) -> Result<bool, Error> {
-        let r = T::regs();
+        let r = self.info.regs;
         loop {
             // Handle all buffered error flags.
             if self.buffered_sr.pe() {
@@ -830,7 +845,7 @@ impl<'d, T: BasicInstance, M: Mode> UartRx<'d, T, M> {
 
     #[cfg(any(usart_v3, usart_v4))]
     fn check_rx_flags(&mut self) -> Result<bool, Error> {
-        let r = T::regs();
+        let r = self.info.regs;
         let sr = r.isr().read();
         if sr.pe() {
             r.icr().write(|w| w.set_pe(true));
@@ -850,7 +865,7 @@ impl<'d, T: BasicInstance, M: Mode> UartRx<'d, T, M> {
 
     /// Read a single u8 if there is one available, otherwise return WouldBlock
     pub(crate) fn nb_read(&mut self) -> Result<u8, nb::Error<Error>> {
-        let r = T::regs();
+        let r = self.info.regs;
         if self.check_rx_flags()? {
             Ok(unsafe { rdr(r).read_volatile() })
         } else {
@@ -860,11 +875,11 @@ impl<'d, T: BasicInstance, M: Mode> UartRx<'d, T, M> {
 
     /// Perform a blocking read into `buffer`
     pub fn blocking_read(&mut self, buffer: &mut [u8]) -> Result<(), Error> {
-        let r = T::regs();
+        let r = self.info.regs;
 
         // Call flush for Half-Duplex mode. It prevents reading of bytes which have just been written.
         if r.cr3().read().hdsel() {
-            UartTx::<'d, T, M>::blocking_flush_inner()?;
+            blocking_flush(self.info)?;
         }
 
         for b in buffer {
@@ -875,26 +890,39 @@ impl<'d, T: BasicInstance, M: Mode> UartRx<'d, T, M> {
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> Drop for UartTx<'d, T, M> {
+impl<'d, M: Mode> Drop for UartTx<'d, M> {
     fn drop(&mut self) {
         self.tx.as_ref().map(|x| x.set_as_disconnected());
         self.cts.as_ref().map(|x| x.set_as_disconnected());
         self.de.as_ref().map(|x| x.set_as_disconnected());
-        T::disable();
+        drop_tx_rx(self.info, self.state); // gives up this half's refcount; the last half disables the clock
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> Drop for UartRx<'d, T, M> {
+impl<'d, M: Mode> Drop for UartRx<'d, M> {
     fn drop(&mut self) {
         self.rx.as_ref().map(|x| x.set_as_disconnected());
         self.rts.as_ref().map(|x| x.set_as_disconnected());
-        T::disable();
+        drop_tx_rx(self.info, self.state); // gives up this half's refcount; the last half disables the clock
     }
 }
 
-impl<'d, T: BasicInstance> Uart<'d, T, Async> {
+fn drop_tx_rx(info: &Info, state: &State) {
+    // Atomic subtraction is not supported on all targets, so the decrement is done inside a critical section
+    let was_last_half = critical_section::with(|_| {
+        let count_before = state.tx_rx_refcount.load(Ordering::Relaxed);
+        assert!(count_before >= 1);
+        state.tx_rx_refcount.store(count_before - 1, Ordering::Relaxed);
+        count_before == 1
+    });
+    if was_last_half {
+        info.enable_bit.disable();
+    }
+}
+
+impl<'d> Uart<'d, Async> {
     /// Create a new bidirectional UART
-    pub fn new(
+    pub fn new<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
@@ -917,7 +945,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Async> {
     }
 
     /// Create a new bidirectional UART with request-to-send and clear-to-send pins
-    pub fn new_with_rtscts(
+    pub fn new_with_rtscts<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
@@ -943,7 +971,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Async> {
 
     #[cfg(not(any(usart_v1, usart_v2)))]
     /// Create a new bidirectional UART with a driver-enable pin
-    pub fn new_with_de(
+    pub fn new_with_de<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
@@ -977,7 +1005,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Async> {
     /// Apart from this, the communication protocol is similar to normal USART mode. Any conflict
     /// on the line must be managed by software (for instance by using a centralized arbiter).
     #[doc(alias("HDSEL"))]
-    pub fn new_half_duplex(
+    pub fn new_half_duplex<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
         _irq: impl interrupt::typelevel::Binding<T::Interrupt, InterruptHandler<T>> + 'd,
@@ -1015,7 +1043,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Async> {
     /// on the line must be managed by software (for instance by using a centralized arbiter).
     #[cfg(not(any(usart_v1, usart_v2)))]
     #[doc(alias("HDSEL"))]
-    pub fn new_half_duplex_on_rx(
+    pub fn new_half_duplex_on_rx<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         _irq: impl interrupt::typelevel::Binding<T::Interrupt, InterruptHandler<T>> + 'd,
@@ -1055,9 +1083,9 @@ impl<'d, T: BasicInstance> Uart<'d, T, Async> {
     }
 }
 
-impl<'d, T: BasicInstance> Uart<'d, T, Blocking> {
+impl<'d> Uart<'d, Blocking> {
     /// Create a new blocking bidirectional UART.
-    pub fn new_blocking(
+    pub fn new_blocking<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
@@ -1077,7 +1105,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Blocking> {
     }
 
     /// Create a new bidirectional UART with request-to-send and clear-to-send pins
-    pub fn new_blocking_with_rtscts(
+    pub fn new_blocking_with_rtscts<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
@@ -1100,7 +1128,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Blocking> {
 
     #[cfg(not(any(usart_v1, usart_v2)))]
     /// Create a new bidirectional UART with a driver-enable pin
-    pub fn new_blocking_with_de(
+    pub fn new_blocking_with_de<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
@@ -1131,7 +1159,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Blocking> {
     /// Apart from this, the communication protocol is similar to normal USART mode. Any conflict
     /// on the line must be managed by software (for instance by using a centralized arbiter).
     #[doc(alias("HDSEL"))]
-    pub fn new_blocking_half_duplex(
+    pub fn new_blocking_half_duplex<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         tx: impl Peripheral<P = impl TxPin<T>> + 'd,
         mut config: Config,
@@ -1166,7 +1194,7 @@ impl<'d, T: BasicInstance> Uart<'d, T, Blocking> {
     /// on the line must be managed by software (for instance by using a centralized arbiter).
     #[cfg(not(any(usart_v1, usart_v2)))]
     #[doc(alias("HDSEL"))]
-    pub fn new_blocking_half_duplex_on_rx(
+    pub fn new_blocking_half_duplex_on_rx<T: Instance>(
         peri: impl Peripheral<P = T> + 'd,
         rx: impl Peripheral<P = impl RxPin<T>> + 'd,
         mut config: Config,
@@ -1188,8 +1216,8 @@ impl<'d, T: BasicInstance> Uart<'d, T, Blocking> {
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> Uart<'d, T, M> {
-    fn new_inner(
+impl<'d, M: Mode> Uart<'d, M> {
+    fn new_inner<T: Instance>(
         _peri: impl Peripheral<P = T> + 'd,
         rx: Option<PeripheralRef<'d, AnyPin>>,
         tx: Option<PeripheralRef<'d, AnyPin>>,
@@ -1200,11 +1228,12 @@ impl<'d, T: BasicInstance, M: Mode> Uart<'d, T, M> {
         rx_dma: Option<ChannelAndRequest<'d>>,
         config: Config,
     ) -> Result<Self, ConfigError> {
-        // UartRx and UartTx have one refcount each.
-        T::enable_and_reset();
         T::enable_and_reset();
 
-        let r = T::regs();
+        let info = T::info();
+        let state = T::state();
+        let kernel_clock = T::frequency();
+        let r = info.regs;
 
         r.cr3().write(|w| {
             w.set_rtse(rts.is_some());
@@ -1212,17 +1241,19 @@ impl<'d, T: BasicInstance, M: Mode> Uart<'d, T, M> {
             #[cfg(not(any(usart_v1, usart_v2)))]
             w.set_dem(de.is_some());
         });
-        configure(r, &config, T::frequency(), T::KIND, true, true)?;
+        configure(info, kernel_clock, &config, true, true)?;
 
         T::Interrupt::unpend();
         unsafe { T::Interrupt::enable() };
 
-        // create state once!
-        let _s = T::state();
+        state.tx_rx_refcount.store(2, Ordering::Relaxed);
 
         Ok(Self {
             tx: UartTx {
                 _phantom: PhantomData,
+                info,
+                state,
+                kernel_clock,
                 tx,
                 cts,
                 de,
@@ -1230,6 +1261,9 @@ impl<'d, T: BasicInstance, M: Mode> Uart<'d, T, M> {
             },
             rx: UartRx {
                 _phantom: PhantomData,
+                info,
+                state,
+                kernel_clock,
                 rx,
                 rts,
                 rx_dma,
@@ -1263,32 +1297,34 @@ impl<'d, T: BasicInstance, M: Mode> Uart<'d, T, M> {
     /// Split the Uart into a transmitter and receiver, which is
     /// particularly useful when having two tasks correlating to
     /// transmitting and receiving.
-    pub fn split(self) -> (UartTx<'d, T, M>, UartRx<'d, T, M>) {
+    pub fn split(self) -> (UartTx<'d, M>, UartRx<'d, M>) {
         (self.tx, self.rx)
     }
 }
 
-fn reconfigure<T: BasicInstance>(config: &Config) -> Result<(), ConfigError> {
-    T::Interrupt::disable();
-    let r = T::regs();
+fn reconfigure(info: &Info, kernel_clock: Hertz, config: &Config) -> Result<(), ConfigError> {
+    info.interrupt.disable(); // the IRQ stays disabled while `configure` rewrites the registers
+    let r = info.regs;
 
     let cr = r.cr1().read();
-    configure(r, config, T::frequency(), T::KIND, cr.re(), cr.te())?;
+    configure(info, kernel_clock, config, cr.re(), cr.te())?; // NOTE(review): on Err the IRQ stays disabled
 
-    T::Interrupt::unpend();
-    unsafe { T::Interrupt::enable() };
+    info.interrupt.unpend(); // discard anything that became pending during the rewrite
+    unsafe { info.interrupt.enable() };
 
     Ok(())
 }
 
 fn configure(
-    r: Regs,
+    info: &Info,
+    kernel_clock: Hertz,
     config: &Config,
-    pclk_freq: Hertz,
-    kind: Kind,
     enable_rx: bool,
     enable_tx: bool,
 ) -> Result<(), ConfigError> {
+    let r = info.regs;
+    let kind = info.kind;
+
     if !enable_rx && !enable_tx {
         return Err(ConfigError::RxOrTxNotEnabled);
     }
@@ -1348,7 +1384,7 @@ fn configure(
     let mut over8 = false;
     let mut found_brr = None;
     for &(presc, _presc_val) in &DIVS {
-        let brr = calculate_brr(config.baudrate, pclk_freq.0, presc as u32, mul);
+        let brr = calculate_brr(config.baudrate, kernel_clock.0, presc as u32, mul);
         trace!(
             "USART: presc={}, div=0x{:08x} (mantissa = {}, fraction = {})",
             presc,
@@ -1389,7 +1425,7 @@ fn configure(
         "Using {} oversampling, desired baudrate: {}, actual baudrate: {}",
         oversampling,
         config.baudrate,
-        pclk_freq.0 / brr * mul
+        kernel_clock.0 / brr * mul
     );
 
     r.cr2().write(|w| {
@@ -1458,14 +1494,14 @@ fn configure(
     Ok(())
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_02::serial::Read<u8> for UartRx<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_02::serial::Read<u8> for UartRx<'d, M> {
     type Error = Error;
     fn read(&mut self) -> Result<u8, nb::Error<Self::Error>> {
         self.nb_read()
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_02::blocking::serial::Write<u8> for UartTx<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_02::blocking::serial::Write<u8> for UartTx<'d, M> {
     type Error = Error;
     fn bwrite_all(&mut self, buffer: &[u8]) -> Result<(), Self::Error> {
         self.blocking_write(buffer)
@@ -1475,14 +1511,14 @@ impl<'d, T: BasicInstance, M: Mode> embedded_hal_02::blocking::serial::Write<u8>
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_02::serial::Read<u8> for Uart<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_02::serial::Read<u8> for Uart<'d, M> {
     type Error = Error;
     fn read(&mut self) -> Result<u8, nb::Error<Self::Error>> {
         self.nb_read()
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_02::blocking::serial::Write<u8> for Uart<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_02::blocking::serial::Write<u8> for Uart<'d, M> {
     type Error = Error;
     fn bwrite_all(&mut self, buffer: &[u8]) -> Result<(), Self::Error> {
         self.blocking_write(buffer)
@@ -1504,25 +1540,25 @@ impl embedded_hal_nb::serial::Error for Error {
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::ErrorType for Uart<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_nb::serial::ErrorType for Uart<'d, M> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::ErrorType for UartTx<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_nb::serial::ErrorType for UartTx<'d, M> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::ErrorType for UartRx<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_nb::serial::ErrorType for UartRx<'d, M> {
     type Error = Error;
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::Read for UartRx<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_nb::serial::Read for UartRx<'d, M> {
     fn read(&mut self) -> nb::Result<u8, Self::Error> {
         self.nb_read()
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::Write for UartTx<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_nb::serial::Write for UartTx<'d, M> {
     fn write(&mut self, char: u8) -> nb::Result<(), Self::Error> {
         self.blocking_write(&[char]).map_err(nb::Error::Other)
     }
@@ -1532,13 +1568,13 @@ impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::Write for UartTx<'d
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::Read for Uart<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_nb::serial::Read for Uart<'d, M> {
     fn read(&mut self) -> Result<u8, nb::Error<Self::Error>> {
         self.nb_read()
     }
 }
 
-impl<'d, T: BasicInstance, M: Mode> embedded_hal_nb::serial::Write for Uart<'d, T, M> {
+impl<'d, M: Mode> embedded_hal_nb::serial::Write for Uart<'d, M> {
     fn write(&mut self, char: u8) -> nb::Result<(), Self::Error> {
         self.blocking_write(&[char]).map_err(nb::Error::Other)
     }
@@ -1554,24 +1590,15 @@ impl embedded_io::Error for Error {
     }
 }
 
-impl<T, M: Mode> embedded_io::ErrorType for Uart<'_, T, M>
-where
-    T: BasicInstance,
-{
+impl<M: Mode> embedded_io::ErrorType for Uart<'_, M> {
     type Error = Error;
 }
 
-impl<T, M: Mode> embedded_io::ErrorType for UartTx<'_, T, M>
-where
-    T: BasicInstance,
-{
+impl<M: Mode> embedded_io::ErrorType for UartTx<'_, M> {
     type Error = Error;
 }
 
-impl<T, M: Mode> embedded_io::Write for Uart<'_, T, M>
-where
-    T: BasicInstance,
-{
+impl<M: Mode> embedded_io::Write for Uart<'_, M> {
     fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         self.blocking_write(buf)?;
         Ok(buf.len())
@@ -1582,10 +1609,7 @@ where
     }
 }
 
-impl<T, M: Mode> embedded_io::Write for UartTx<'_, T, M>
-where
-    T: BasicInstance,
-{
+impl<M: Mode> embedded_io::Write for UartTx<'_, M> {
     fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         self.blocking_write(buf)?;
         Ok(buf.len())
@@ -1596,10 +1620,7 @@ where
     }
 }
 
-impl<T> embedded_io_async::Write for Uart<'_, T, Async>
-where
-    T: BasicInstance,
-{
+impl embedded_io_async::Write for Uart<'_, Async> {
     async fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         self.write(buf).await?;
         Ok(buf.len())
@@ -1610,10 +1631,7 @@ where
     }
 }
 
-impl<T> embedded_io_async::Write for UartTx<'_, T, Async>
-where
-    T: BasicInstance,
-{
+impl embedded_io_async::Write for UartTx<'_, Async> {
     async fn write(&mut self, buf: &[u8]) -> Result<usize, Self::Error> {
         self.write(buf).await?;
         Ok(buf.len())
@@ -1686,72 +1704,75 @@ enum Kind {
 
 struct State {
     rx_waker: AtomicWaker,
+    tx_rx_refcount: AtomicU8,
 }
 
 impl State {
     const fn new() -> Self {
         Self {
             rx_waker: AtomicWaker::new(),
+            tx_rx_refcount: AtomicU8::new(0),
         }
     }
 }
 
-trait SealedBasicInstance: crate::rcc::RccPeripheral {
-    const KIND: Kind;
+struct Info {
+    regs: Regs,
+    enable_bit: ClockEnableBit,
+    interrupt: Interrupt,
+    kind: Kind,
+}
 
-    fn regs() -> Regs;
+#[allow(private_interfaces)]
+pub(crate) trait SealedInstance: crate::rcc::RccPeripheral {
+    fn info() -> &'static Info;
     fn state() -> &'static State;
-
     fn buffered_state() -> &'static buffered::State;
 }
 
-trait SealedFullInstance: SealedBasicInstance {
-    #[allow(unused)]
-    fn regs_uart() -> crate::pac::usart::Usart;
-}
-
-/// Basic UART driver instance
+/// USART peripheral instance trait.
 #[allow(private_bounds)]
-pub trait BasicInstance: Peripheral<P = Self> + SealedBasicInstance + 'static + Send {
-    /// Interrupt for this instance.
+pub trait Instance: Peripheral<P = Self> + SealedInstance + 'static + Send {
+    /// Interrupt for this peripheral.
     type Interrupt: interrupt::typelevel::Interrupt;
 }
 
-/// Full UART driver instance
-#[allow(private_bounds)]
-pub trait FullInstance: SealedFullInstance {}
+pin_trait!(RxPin, Instance);
+pin_trait!(TxPin, Instance);
+pin_trait!(CtsPin, Instance);
+pin_trait!(RtsPin, Instance);
+pin_trait!(CkPin, Instance);
+pin_trait!(DePin, Instance);
 
-pin_trait!(RxPin, BasicInstance);
-pin_trait!(TxPin, BasicInstance);
-pin_trait!(CtsPin, BasicInstance);
-pin_trait!(RtsPin, BasicInstance);
-pin_trait!(CkPin, BasicInstance);
-pin_trait!(DePin, BasicInstance);
-
-dma_trait!(TxDma, BasicInstance);
-dma_trait!(RxDma, BasicInstance);
+dma_trait!(TxDma, Instance);
+dma_trait!(RxDma, Instance);
 
 macro_rules! impl_usart {
     ($inst:ident, $irq:ident, $kind:expr) => {
-        impl SealedBasicInstance for crate::peripherals::$inst {
-            const KIND: Kind = $kind;
-
-            fn regs() -> Regs {
-                unsafe { Regs::from_ptr(crate::pac::$inst.as_ptr()) }
+        #[allow(private_interfaces)]
+        impl SealedInstance for crate::peripherals::$inst {
+            fn info() -> &'static Info {
+                static INFO: Info = Info {
+                    regs: unsafe { Regs::from_ptr(crate::pac::$inst.as_ptr()) },
+                    enable_bit: crate::peripherals::$inst::ENABLE_BIT,
+                    interrupt: crate::interrupt::typelevel::$irq::IRQ,
+                    kind: $kind,
+                };
+                &INFO
             }
 
-            fn state() -> &'static crate::usart::State {
-                static STATE: crate::usart::State = crate::usart::State::new();
+            fn state() -> &'static State {
+                static STATE: State = State::new();
                 &STATE
             }
 
             fn buffered_state() -> &'static buffered::State {
-                static STATE: buffered::State = buffered::State::new();
-                &STATE
+                static BUFFERED_STATE: buffered::State = buffered::State::new();
+                &BUFFERED_STATE
             }
         }
 
-        impl BasicInstance for peripherals::$inst {
+        impl Instance for crate::peripherals::$inst {
             type Interrupt = crate::interrupt::typelevel::$irq;
         }
     };
@@ -1761,16 +1782,7 @@ foreach_interrupt!(
     ($inst:ident, usart, LPUART, $signal_name:ident, $irq:ident) => {
         impl_usart!($inst, $irq, Kind::Lpuart);
     };
-
     ($inst:ident, usart, $block:ident, $signal_name:ident, $irq:ident) => {
         impl_usart!($inst, $irq, Kind::Uart);
-
-        impl SealedFullInstance for peripherals::$inst {
-            fn regs_uart() -> crate::pac::usart::Usart {
-                crate::pac::$inst
-            }
-        }
-
-        impl FullInstance for peripherals::$inst {}
     };
 );
diff --git a/embassy-stm32/src/usart/ringbuffered.rs b/embassy-stm32/src/usart/ringbuffered.rs
index 0a6491bd5..f3a88b93f 100644
--- a/embassy-stm32/src/usart/ringbuffered.rs
+++ b/embassy-stm32/src/usart/ringbuffered.rs
@@ -1,5 +1,4 @@
 use core::future::poll_fn;
-use core::marker::PhantomData;
 use core::mem;
 use core::sync::atomic::{compiler_fence, Ordering};
 use core::task::Poll;
@@ -7,20 +6,23 @@ use core::task::Poll;
 use embassy_embedded_hal::SetConfig;
 use futures_util::future::{select, Either};
 
-use super::{clear_interrupt_flags, rdr, reconfigure, sr, BasicInstance, Config, ConfigError, Error, UartRx};
+use super::{clear_interrupt_flags, rdr, reconfigure, sr, Config, ConfigError, Error, Info, State, UartRx};
 use crate::dma::ReadableRingBuffer;
 use crate::mode::Async;
+use crate::time::Hertz;
 use crate::usart::{Regs, Sr};
 
 /// Rx-only Ring-buffered UART Driver
 ///
 /// Created with [UartRx::into_ring_buffered]
-pub struct RingBufferedUartRx<'d, T: BasicInstance> {
-    _phantom: PhantomData<T>,
+pub struct RingBufferedUartRx<'d> {
+    info: &'static Info,
+    state: &'static State,
+    kernel_clock: Hertz,
     ring_buf: ReadableRingBuffer<'d, u8>,
 }
 
-impl<'d, T: BasicInstance> SetConfig for RingBufferedUartRx<'d, T> {
+impl<'d> SetConfig for RingBufferedUartRx<'d> {
     type Config = Config;
     type ConfigError = ConfigError;
 
@@ -29,11 +31,11 @@ impl<'d, T: BasicInstance> SetConfig for RingBufferedUartRx<'d, T> {
     }
 }
 
-impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
+impl<'d> UartRx<'d, Async> {
     /// Turn the `UartRx` into a buffered uart which can continously receive in the background
     /// without the possibility of losing bytes. The `dma_buf` is a buffer registered to the
     /// DMA controller, and must be large enough to prevent overflows.
-    pub fn into_ring_buffered(mut self, dma_buf: &'d mut [u8]) -> RingBufferedUartRx<'d, T> {
+    pub fn into_ring_buffered(mut self, dma_buf: &'d mut [u8]) -> RingBufferedUartRx<'d> {
         assert!(!dma_buf.is_empty() && dma_buf.len() <= 0xFFFF);
 
         let opts = Default::default();
@@ -43,19 +45,24 @@ impl<'d, T: BasicInstance> UartRx<'d, T, Async> {
         let request = rx_dma.request;
         let rx_dma = unsafe { rx_dma.channel.clone_unchecked() };
 
-        let ring_buf = unsafe { ReadableRingBuffer::new(rx_dma, request, rdr(T::regs()), dma_buf, opts) };
+        let info = self.info;
+        let state = self.state;
+        let kernel_clock = self.kernel_clock;
+        let ring_buf = unsafe { ReadableRingBuffer::new(rx_dma, request, rdr(info.regs), dma_buf, opts) };
 
         // Don't disable the clock
         mem::forget(self);
 
         RingBufferedUartRx {
-            _phantom: PhantomData,
+            info,
+            state,
+            kernel_clock,
             ring_buf,
         }
     }
 }
 
-impl<'d, T: BasicInstance> RingBufferedUartRx<'d, T> {
+impl<'d> RingBufferedUartRx<'d> {
     /// Clear the ring buffer and start receiving in the background
     pub fn start(&mut self) -> Result<(), Error> {
         // Clear the ring buffer so that it is ready to receive data
@@ -74,7 +81,7 @@ impl<'d, T: BasicInstance> RingBufferedUartRx<'d, T> {
 
     /// Reconfigure the driver
     pub fn set_config(&mut self, config: &Config) -> Result<(), ConfigError> {
-        reconfigure::<T>(config)
+        reconfigure(self.info, self.kernel_clock, config)
     }
 
     /// Start uart background receive
@@ -85,7 +92,7 @@ impl<'d, T: BasicInstance> RingBufferedUartRx<'d, T> {
         // start the dma controller
         self.ring_buf.start();
 
-        let r = T::regs();
+        let r = self.info.regs;
         // clear all interrupts and DMA Rx Request
         r.cr1().modify(|w| {
             // disable RXNE interrupt
@@ -107,7 +114,7 @@ impl<'d, T: BasicInstance> RingBufferedUartRx<'d, T> {
     fn teardown_uart(&mut self) {
         self.ring_buf.request_stop();
 
-        let r = T::regs();
+        let r = self.info.regs;
         // clear all interrupts and DMA Rx Request
         r.cr1().modify(|w| {
             // disable RXNE interrupt
@@ -136,14 +143,14 @@ impl<'d, T: BasicInstance> RingBufferedUartRx<'d, T> {
     /// Receive in the background is terminated if an error is returned.
     /// It must then manually be started again by calling `start()` or by re-calling `read()`.
     pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
-        let r = T::regs();
+        let r = self.info.regs;
 
         // Start background receive if it was not already started
         if !r.cr3().read().dmar() {
             self.start()?;
         }
 
-        check_for_errors(clear_idle_flag(T::regs()))?;
+        check_for_errors(clear_idle_flag(r))?;
 
         loop {
             match self.ring_buf.read(buf) {
@@ -184,15 +191,15 @@ impl<'d, T: BasicInstance> RingBufferedUartRx<'d, T> {
         });
 
         // Future which completes when idle line is detected
+        let s = self.state;
         let uart = poll_fn(|cx| {
-            let s = T::state();
             s.rx_waker.register(cx.waker());
 
             compiler_fence(Ordering::SeqCst);
 
             // Critical section is needed so that IDLE isn't set after
             // our read but before we clear it.
-            let sr = critical_section::with(|_| clear_idle_flag(T::regs()));
+            let sr = critical_section::with(|_| clear_idle_flag(self.info.regs));
 
             check_for_errors(sr)?;
 
@@ -211,13 +218,13 @@ impl<'d, T: BasicInstance> RingBufferedUartRx<'d, T> {
     }
 }
 
-impl<T: BasicInstance> Drop for RingBufferedUartRx<'_, T> {
+impl Drop for RingBufferedUartRx<'_> {
     fn drop(&mut self) {
         self.teardown_uart();
-
-        T::disable();
+        super::drop_tx_rx(self.info, self.state);
     }
 }
+
 /// Return an error result if the Sr register has errors
 fn check_for_errors(s: Sr) -> Result<(), Error> {
     if s.pe() {
@@ -248,17 +255,11 @@ fn clear_idle_flag(r: Regs) -> Sr {
     sr
 }
 
-impl<T> embedded_io_async::ErrorType for RingBufferedUartRx<'_, T>
-where
-    T: BasicInstance,
-{
+impl embedded_io_async::ErrorType for RingBufferedUartRx<'_> {
     type Error = Error;
 }
 
-impl<T> embedded_io_async::Read for RingBufferedUartRx<'_, T>
-where
-    T: BasicInstance,
-{
+impl embedded_io_async::Read for RingBufferedUartRx<'_> {
     async fn read(&mut self, buf: &mut [u8]) -> Result<usize, Self::Error> {
         self.read(buf).await
     }
diff --git a/examples/stm32h5/src/bin/usart_split.rs b/examples/stm32h5/src/bin/usart_split.rs
index 77b4caa9e..d26c5003c 100644
--- a/examples/stm32h5/src/bin/usart_split.rs
+++ b/examples/stm32h5/src/bin/usart_split.rs
@@ -4,7 +4,6 @@
 use defmt::*;
 use embassy_executor::Spawner;
 use embassy_stm32::mode::Async;
-use embassy_stm32::peripherals::UART7;
 use embassy_stm32::usart::{Config, Uart, UartRx};
 use embassy_stm32::{bind_interrupts, peripherals, usart};
 use embassy_sync::blocking_mutex::raw::ThreadModeRawMutex;
@@ -38,7 +37,7 @@ async fn main(spawner: Spawner) -> ! {
 }
 
 #[embassy_executor::task]
-async fn reader(mut rx: UartRx<'static, UART7, Async>) {
+async fn reader(mut rx: UartRx<'static, Async>) {
     let mut buf = [0; 8];
     loop {
         info!("reading...");
diff --git a/examples/stm32h7/src/bin/usart_split.rs b/examples/stm32h7/src/bin/usart_split.rs
index 4ad8e77ce..2bb58be5e 100644
--- a/examples/stm32h7/src/bin/usart_split.rs
+++ b/examples/stm32h7/src/bin/usart_split.rs
@@ -4,7 +4,6 @@
 use defmt::*;
 use embassy_executor::Spawner;
 use embassy_stm32::mode::Async;
-use embassy_stm32::peripherals::UART7;
 use embassy_stm32::usart::{Config, Uart, UartRx};
 use embassy_stm32::{bind_interrupts, peripherals, usart};
 use embassy_sync::blocking_mutex::raw::ThreadModeRawMutex;
@@ -38,7 +37,7 @@ async fn main(spawner: Spawner) -> ! {
 }
 
 #[embassy_executor::task]
-async fn reader(mut rx: UartRx<'static, UART7, Async>) {
+async fn reader(mut rx: UartRx<'static, Async>) {
     let mut buf = [0; 8];
     loop {
         info!("reading...");
diff --git a/examples/stm32h7rs/src/bin/usart_split.rs b/examples/stm32h7rs/src/bin/usart_split.rs
index 77b4caa9e..d26c5003c 100644
--- a/examples/stm32h7rs/src/bin/usart_split.rs
+++ b/examples/stm32h7rs/src/bin/usart_split.rs
@@ -4,7 +4,6 @@
 use defmt::*;
 use embassy_executor::Spawner;
 use embassy_stm32::mode::Async;
-use embassy_stm32::peripherals::UART7;
 use embassy_stm32::usart::{Config, Uart, UartRx};
 use embassy_stm32::{bind_interrupts, peripherals, usart};
 use embassy_sync::blocking_mutex::raw::ThreadModeRawMutex;
@@ -38,7 +37,7 @@ async fn main(spawner: Spawner) -> ! {
 }
 
 #[embassy_executor::task]
-async fn reader(mut rx: UartRx<'static, UART7, Async>) {
+async fn reader(mut rx: UartRx<'static, Async>) {
     let mut buf = [0; 8];
     loop {
         info!("reading...");
diff --git a/tests/stm32/src/bin/usart_rx_ringbuffered.rs b/tests/stm32/src/bin/usart_rx_ringbuffered.rs
index 908452eaf..ea1e52358 100644
--- a/tests/stm32/src/bin/usart_rx_ringbuffered.rs
+++ b/tests/stm32/src/bin/usart_rx_ringbuffered.rs
@@ -52,7 +52,7 @@ async fn main(spawner: Spawner) {
 }
 
 #[embassy_executor::task]
-async fn transmit_task(mut tx: UartTx<'static, peris::UART, Async>) {
+async fn transmit_task(mut tx: UartTx<'static, Async>) {
     // workaround https://github.com/embassy-rs/embassy/issues/1426
     Timer::after_millis(100).await;
 
@@ -75,7 +75,7 @@ async fn transmit_task(mut tx: UartTx<'static, peris::UART, Async>) {
 }
 
 #[embassy_executor::task]
-async fn receive_task(mut rx: RingBufferedUartRx<'static, peris::UART>) {
+async fn receive_task(mut rx: RingBufferedUartRx<'static>) {
     info!("Ready to receive...");
 
     let mut rng = ChaCha8Rng::seed_from_u64(1337);