Auto merge of #3122 - RalfJung:intrptrcast-clean, r=saethlin
intptrcast: remove information about dead allocations

The discussion in https://github.com/rust-lang/miri/pull/3103 convinced me we don't have to keep `int_to_ptr_map` entries around for dead allocations. But we should not make that depend on the GC; we can simply tie the cleanup to the moment the allocation gets freed. That way everything still behaves deterministically if anything weird happens (but it shouldn't).

r? `@saethlin`

Only the first and last commit contain logic changes; the 2nd commit just moves code around a bit.
Commit 1e71277b2d
3 changed files with 133 additions and 108 deletions
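Before the diff, a minimal self-contained sketch of the bookkeeping being adopted. The three field names mirror the real ones in `GlobalStateInner`; the types and everything around them are simplified stand-ins rather than Miri's actual definitions:

use std::collections::{HashMap, HashSet};

type AllocId = u64;

struct AddrState {
    int_to_ptr_map: Vec<(u64, AllocId)>, // sorted by base address
    base_addr: HashMap<AllocId, u64>,    // survives the death of the allocation
    exposed: HashSet<AllocId>,
}

impl AddrState {
    // Runs when an allocation is freed: drop the int-to-ptr entry and the
    // exposure flag, but keep `base_addr` so dangling pointers still map
    // back to an AllocId for error reporting.
    fn free_alloc_id(&mut self, dead_id: AllocId) {
        self.int_to_ptr_map.retain(|&(_, id)| id != dead_id);
        self.exposed.remove(&dead_id);
    }
}

fn main() {
    let mut state = AddrState {
        int_to_ptr_map: vec![(0x100, 1), (0x200, 2)],
        base_addr: HashMap::from([(1, 0x100), (2, 0x200)]),
        exposed: HashSet::from([2]),
    };
    state.free_alloc_id(2);
    assert_eq!(state.int_to_ptr_map, vec![(0x100, 1)]); // int-to-ptr lookups no longer see it
    assert!(state.base_addr.contains_key(&2));          // ptr-to-int info is kept
    assert!(!state.exposed.contains(&2));
}

Note that the cleanup runs at deallocation time, not at GC time, which is what keeps the behavior deterministic.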
@@ -26,8 +26,10 @@ pub type GlobalState = RefCell<GlobalStateInner>;
 #[derive(Clone, Debug)]
 pub struct GlobalStateInner {
-    /// This is used as a map between the address of each allocation and its `AllocId`.
-    /// It is always sorted
+    /// This is used as a map between the address of each allocation and its `AllocId`. It is always
+    /// sorted. We cannot use a `HashMap` since we can be given an address that is offset from the
+    /// base address, and we need to find the `AllocId` it belongs to.
+    /// This is not the *full* inverse of `base_addr`; dead allocations have been removed.
     int_to_ptr_map: Vec<(u64, AllocId)>,
     /// The base address for each allocation. We cannot put that into
     /// `AllocExtra` because function pointers also have a base address, and
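The rewritten doc comment is the heart of the data structure: a wildcard access can land anywhere inside an allocation, so the lookup has to answer "greatest base address that is <= addr", which a sorted `Vec` gives via binary search. A hypothetical, heavily simplified version of that lookup (plain `u64` ids, sizes supplied by the caller):

fn alloc_id_for(map: &[(u64, u64)], size_of: impl Fn(u64) -> u64, addr: u64) -> Option<u64> {
    // `partition_point` is a binary search: first index whose base exceeds `addr`.
    let pos = map.partition_point(|&(base, _)| base <= addr);
    if pos == 0 {
        return None; // addr is below every base address
    }
    let (base, id) = map[pos - 1];
    let offset = addr - base; // cannot overflow: base <= addr
    // Base addresses always match; interior addresses must be strictly in-bounds,
    // mirroring the `offset < size` change in this commit.
    if addr == base || offset < size_of(id) { Some(id) } else { None }
}

fn main() {
    let map = [(0x100, 0), (0x200, 1)]; // sorted by base address
    let size_of = |id| [0x10, 0x0][id as usize]; // allocation 1 is zero-sized
    assert_eq!(alloc_id_for(&map, size_of, 0x108), Some(0)); // interior hit
    assert_eq!(alloc_id_for(&map, size_of, 0x110), None); // "end" address: not recognized
    assert_eq!(alloc_id_for(&map, size_of, 0x200), Some(1)); // zero-sized base still found
}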
@@ -62,10 +64,21 @@ impl GlobalStateInner {
     }
 }

-impl<'mir, 'tcx> GlobalStateInner {
+/// Shifts `addr` to make it aligned with `align` by rounding `addr` to the smallest multiple
+/// of `align` that is larger or equal to `addr`
+fn align_addr(addr: u64, align: u64) -> u64 {
+    match addr % align {
+        0 => addr,
+        rem => addr.checked_add(align).unwrap() - rem,
+    }
+}
+
+impl<'mir, 'tcx: 'mir> EvalContextExtPriv<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
+trait EvalContextExtPriv<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
     // Returns the exposed `AllocId` that corresponds to the specified addr,
     // or `None` if the addr is out of bounds
-    fn alloc_id_from_addr(ecx: &MiriInterpCx<'mir, 'tcx>, addr: u64) -> Option<AllocId> {
+    fn alloc_id_from_addr(&self, addr: u64) -> Option<AllocId> {
+        let ecx = self.eval_context_ref();
         let global_state = ecx.machine.intptrcast.borrow();
         assert!(global_state.provenance_mode != ProvenanceMode::Strict);
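One detail worth noting in `align_addr`, which this commit moves out of the impl block but otherwise leaves untouched: the intermediate `addr + align` can overflow even when the rounded result would fit, hence the `checked_add` before subtracting `rem`. A quick standalone check of the behavior (same function body as above):

fn align_addr(addr: u64, align: u64) -> u64 {
    match addr % align {
        0 => addr,
        rem => addr.checked_add(align).unwrap() - rem,
    }
}

fn main() {
    assert_eq!(align_addr(37, 4), 40); // rounded up to the next multiple of 4
    assert_eq!(align_addr(44, 4), 44); // already aligned, returned unchanged
    // Near the top of the address space, `addr + align` wraps silently in
    // release builds; `checked_add(..).unwrap()` turns that into a loud panic.
    // Not executed here: align_addr(u64::MAX - 2, 4) would panic.
}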
@@ -82,31 +95,90 @@ impl<'mir, 'tcx> GlobalStateInner {
                 let (glb, alloc_id) = global_state.int_to_ptr_map[pos - 1];
                 // This never overflows because `addr >= glb`
                 let offset = addr - glb;
-                // If the offset exceeds the size of the allocation, don't use this `alloc_id`.
+                // We require this to be strict in-bounds of the allocation. This arm is only
+                // entered for addresses that are not the base address, so even zero-sized
+                // allocations will get recognized at their base address -- but all other
+                // allocations will *not* be recognized at their "end" address.
                 let size = ecx.get_alloc_info(alloc_id).0;
-                if offset <= size.bytes() { Some(alloc_id) } else { None }
+                if offset < size.bytes() { Some(alloc_id) } else { None }
             }
         }?;

-        // We only use this provenance if it has been exposed, *and* is still live.
+        // We only use this provenance if it has been exposed.
         if global_state.exposed.contains(&alloc_id) {
-            let (_size, _align, kind) = ecx.get_alloc_info(alloc_id);
-            match kind {
-                AllocKind::LiveData | AllocKind::Function | AllocKind::VTable => {
-                    return Some(alloc_id);
-                }
-                AllocKind::Dead => {}
-            }
+            // This must still be live, since we remove allocations from `int_to_ptr_map` when they get freed.
+            debug_assert!(!matches!(ecx.get_alloc_info(alloc_id).2, AllocKind::Dead));
+            Some(alloc_id)
+        } else {
+            None
         }
-
-        None
     }

-    pub fn expose_ptr(
-        ecx: &mut MiriInterpCx<'mir, 'tcx>,
-        alloc_id: AllocId,
-        tag: BorTag,
-    ) -> InterpResult<'tcx> {
+    fn addr_from_alloc_id(&self, alloc_id: AllocId) -> InterpResult<'tcx, u64> {
+        let ecx = self.eval_context_ref();
+        let mut global_state = ecx.machine.intptrcast.borrow_mut();
+        let global_state = &mut *global_state;
+
+        Ok(match global_state.base_addr.entry(alloc_id) {
+            Entry::Occupied(entry) => *entry.get(),
+            Entry::Vacant(entry) => {
+                let (size, align, kind) = ecx.get_alloc_info(alloc_id);
+                // This is either called immediately after allocation (and then cached), or when
+                // adjusting `tcx` pointers (which never get freed). So assert that we are looking
+                // at a live allocation. This also ensures that we never re-assign an address to an
+                // allocation that previously had an address, but then was freed and the address
+                // information was removed.
+                assert!(!matches!(kind, AllocKind::Dead));
+
+                // This allocation does not have a base address yet, pick one.
+                // Leave some space to the previous allocation, to give it some chance to be less aligned.
+                let slack = {
+                    let mut rng = ecx.machine.rng.borrow_mut();
+                    // This means that `(global_state.next_base_addr + slack) % 16` is uniformly distributed.
+                    rng.gen_range(0..16)
+                };
+                // From next_base_addr + slack, round up to adjust for alignment.
+                let base_addr = global_state
+                    .next_base_addr
+                    .checked_add(slack)
+                    .ok_or_else(|| err_exhaust!(AddressSpaceFull))?;
+                let base_addr = align_addr(base_addr, align.bytes());
+                entry.insert(base_addr);
+                trace!(
+                    "Assigning base address {:#x} to allocation {:?} (size: {}, align: {}, slack: {})",
+                    base_addr,
+                    alloc_id,
+                    size.bytes(),
+                    align.bytes(),
+                    slack,
+                );
+
+                // Remember next base address. If this allocation is zero-sized, leave a gap
+                // of at least 1 to avoid two allocations having the same base address.
+                // (The logic in `alloc_id_from_addr` assumes unique addresses, and different
+                // function/vtable pointers need to be distinguishable!)
+                global_state.next_base_addr = base_addr
+                    .checked_add(max(size.bytes(), 1))
+                    .ok_or_else(|| err_exhaust!(AddressSpaceFull))?;
+                // Even if `Size` didn't overflow, we might still have filled up the address space.
+                if global_state.next_base_addr > ecx.target_usize_max() {
+                    throw_exhaust!(AddressSpaceFull);
+                }
+                // Also maintain the opposite mapping in `int_to_ptr_map`.
+                // Given that `next_base_addr` increases in each allocation, pushing the
+                // corresponding tuple keeps `int_to_ptr_map` sorted
+                global_state.int_to_ptr_map.push((base_addr, alloc_id));
+
+                base_addr
+            }
+        })
+    }
+}
+
+impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
+pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
+    fn expose_ptr(&mut self, alloc_id: AllocId, tag: BorTag) -> InterpResult<'tcx> {
+        let ecx = self.eval_context_mut();
         let global_state = ecx.machine.intptrcast.get_mut();
         // In strict mode, we don't need this, so we can save some cycles by not tracking it.
         if global_state.provenance_mode != ProvenanceMode::Strict {
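The `Vacant` arm above is the address-assignment policy in miniature: add a little random slack so consecutive allocations are not all maximally aligned, round up for alignment, then reserve at least one byte so even zero-sized allocations get unique base addresses. A simplified sketch of just that arithmetic, with the slack passed in by the caller instead of drawn from `ecx.machine.rng`, and with the address-space-exhaustion checks left out:

fn align_addr(addr: u64, align: u64) -> u64 {
    match addr % align {
        0 => addr,
        rem => addr.checked_add(align).unwrap() - rem,
    }
}

fn pick_base_addr(next_base_addr: &mut u64, size: u64, align: u64, slack: u64) -> u64 {
    // Slack (0..16 in the real code) keeps allocations from all being maximally aligned.
    let base_addr = align_addr(*next_base_addr + slack, align);
    // Advance by at least 1 byte so no two allocations share a base address.
    *next_base_addr = base_addr + size.max(1);
    base_addr
}

fn main() {
    let mut next = 0x100;
    let a = pick_base_addr(&mut next, 24, 8, 3); // 0x100 + 3 = 0x103, aligned up to 0x108
    assert_eq!(a, 0x108);
    let b = pick_base_addr(&mut next, 0, 1, 0); // zero-sized, still gets a unique address
    assert_eq!(b, 0x120);
    assert_eq!(next, 0x121); // the gap of 1 was reserved
}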
@@ -119,14 +191,13 @@ impl<'mir, 'tcx> GlobalStateInner {
         Ok(())
     }

-    pub fn ptr_from_addr_cast(
-        ecx: &MiriInterpCx<'mir, 'tcx>,
-        addr: u64,
-    ) -> InterpResult<'tcx, Pointer<Option<Provenance>>> {
+    fn ptr_from_addr_cast(&self, addr: u64) -> InterpResult<'tcx, Pointer<Option<Provenance>>> {
         trace!("Casting {:#x} to a pointer", addr);

-        // Potentially emit a warning.
+        let ecx = self.eval_context_ref();
         let global_state = ecx.machine.intptrcast.borrow();
+
+        // Potentially emit a warning.
         match global_state.provenance_mode {
             ProvenanceMode::Default => {
                 // The first time this happens at a particular location, print a warning.
@@ -157,71 +228,16 @@ impl<'mir, 'tcx> GlobalStateInner {
         Ok(Pointer::new(Some(Provenance::Wildcard), Size::from_bytes(addr)))
     }

-    fn alloc_base_addr(
-        ecx: &MiriInterpCx<'mir, 'tcx>,
-        alloc_id: AllocId,
-    ) -> InterpResult<'tcx, u64> {
-        let mut global_state = ecx.machine.intptrcast.borrow_mut();
-        let global_state = &mut *global_state;
-
-        Ok(match global_state.base_addr.entry(alloc_id) {
-            Entry::Occupied(entry) => *entry.get(),
-            Entry::Vacant(entry) => {
-                // There is nothing wrong with a raw pointer being cast to an integer only after
-                // it became dangling. Hence we allow dead allocations.
-                let (size, align, _kind) = ecx.get_alloc_info(alloc_id);
-
-                // This allocation does not have a base address yet, pick one.
-                // Leave some space to the previous allocation, to give it some chance to be less aligned.
-                let slack = {
-                    let mut rng = ecx.machine.rng.borrow_mut();
-                    // This means that `(global_state.next_base_addr + slack) % 16` is uniformly distributed.
-                    rng.gen_range(0..16)
-                };
-                // From next_base_addr + slack, round up to adjust for alignment.
-                let base_addr = global_state
-                    .next_base_addr
-                    .checked_add(slack)
-                    .ok_or_else(|| err_exhaust!(AddressSpaceFull))?;
-                let base_addr = Self::align_addr(base_addr, align.bytes());
-                entry.insert(base_addr);
-                trace!(
-                    "Assigning base address {:#x} to allocation {:?} (size: {}, align: {}, slack: {})",
-                    base_addr,
-                    alloc_id,
-                    size.bytes(),
-                    align.bytes(),
-                    slack,
-                );
-
-                // Remember next base address. If this allocation is zero-sized, leave a gap
-                // of at least 1 to avoid two allocations having the same base address.
-                // (The logic in `alloc_id_from_addr` assumes unique addresses, and different
-                // function/vtable pointers need to be distinguishable!)
-                global_state.next_base_addr = base_addr
-                    .checked_add(max(size.bytes(), 1))
-                    .ok_or_else(|| err_exhaust!(AddressSpaceFull))?;
-                // Even if `Size` didn't overflow, we might still have filled up the address space.
-                if global_state.next_base_addr > ecx.target_usize_max() {
-                    throw_exhaust!(AddressSpaceFull);
-                }
-                // Given that `next_base_addr` increases in each allocation, pushing the
-                // corresponding tuple keeps `int_to_ptr_map` sorted
-                global_state.int_to_ptr_map.push((base_addr, alloc_id));
-
-                base_addr
-            }
-        })
-    }
-
     /// Convert a relative (tcx) pointer to a Miri pointer.
-    pub fn ptr_from_rel_ptr(
-        ecx: &MiriInterpCx<'mir, 'tcx>,
+    fn ptr_from_rel_ptr(
+        &self,
         ptr: Pointer<AllocId>,
         tag: BorTag,
     ) -> InterpResult<'tcx, Pointer<Provenance>> {
+        let ecx = self.eval_context_ref();
+
         let (alloc_id, offset) = ptr.into_parts(); // offset is relative (AllocId provenance)
-        let base_addr = GlobalStateInner::alloc_base_addr(ecx, alloc_id)?;
+        let base_addr = ecx.addr_from_alloc_id(alloc_id)?;

         // Add offset with the right kind of pointer-overflowing arithmetic.
         let dl = ecx.data_layout();
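"The right kind of pointer-overflowing arithmetic" in `ptr_from_rel_ptr` means the base-plus-offset addition must wrap at the target's pointer width instead of panicking, because a dangling or far-offset pointer may legally carry an offset that runs past the end of the address space. A sketch of the idea, assuming a 64-bit target (the real code consults the target's data layout rather than hard-coding `u64`):

fn rel_to_absolute(base_addr: u64, offset: u64) -> u64 {
    // Wrapping, not checked: out-of-range pointer arithmetic is not an
    // immediate error, it only becomes one if the pointer is dereferenced.
    base_addr.wrapping_add(offset)
}

fn main() {
    assert_eq!(rel_to_absolute(0x1000, 8), 0x1008); // the common case
    assert_eq!(rel_to_absolute(u64::MAX - 4, 8), 3); // wraps instead of panicking
}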
|
@ -231,22 +247,21 @@ impl<'mir, 'tcx> GlobalStateInner {
|
|||
|
||||
/// When a pointer is used for a memory access, this computes where in which allocation the
|
||||
/// access is going.
|
||||
pub fn ptr_get_alloc(
|
||||
ecx: &MiriInterpCx<'mir, 'tcx>,
|
||||
ptr: Pointer<Provenance>,
|
||||
) -> Option<(AllocId, Size)> {
|
||||
fn ptr_get_alloc(&self, ptr: Pointer<Provenance>) -> Option<(AllocId, Size)> {
|
||||
let ecx = self.eval_context_ref();
|
||||
|
||||
let (tag, addr) = ptr.into_parts(); // addr is absolute (Tag provenance)
|
||||
|
||||
let alloc_id = if let Provenance::Concrete { alloc_id, .. } = tag {
|
||||
alloc_id
|
||||
} else {
|
||||
// A wildcard pointer.
|
||||
GlobalStateInner::alloc_id_from_addr(ecx, addr.bytes())?
|
||||
ecx.alloc_id_from_addr(addr.bytes())?
|
||||
};
|
||||
|
||||
// This cannot fail: since we already have a pointer with that provenance, rel_ptr_to_addr
|
||||
// must have been called in the past.
|
||||
let base_addr = GlobalStateInner::alloc_base_addr(ecx, alloc_id).unwrap();
|
||||
// must have been called in the past, so we can just look up the address in the map.
|
||||
let base_addr = ecx.addr_from_alloc_id(alloc_id).unwrap();
|
||||
|
||||
// Wrapping "addr - base_addr"
|
||||
#[allow(clippy::cast_possible_wrap)] // we want to wrap here
|
||||
|
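The tail of `ptr_get_alloc` performs the inverse computation: recover the offset as a wrapping "addr - base_addr", so an address below the base comes out as a negative offset rather than causing an overflow panic. That is what the `clippy::cast_possible_wrap` allowance is for; in isolation, the trick looks like this (assuming 64-bit addresses):

fn main() {
    let base_addr: u64 = 0x2000;
    let addr: u64 = 0x1ff8; // a pointer offset to *below* its base

    // Negate the base in two's complement, then add: this is exactly a
    // wrapping subtraction `addr - base_addr`.
    #[allow(clippy::cast_possible_wrap)] // we want to wrap here, as in the diff
    let neg_base_addr = (base_addr as i64).wrapping_neg();
    let offset = addr.wrapping_add(neg_base_addr as u64);

    assert_eq!(offset as i64, -8); // read as signed: 8 bytes below the base
}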
@@ -256,14 +271,24 @@ impl<'mir, 'tcx> GlobalStateInner {
             Size::from_bytes(ecx.overflowing_signed_offset(addr.bytes(), neg_base_addr).0),
         ))
     }
+}

-    /// Shifts `addr` to make it aligned with `align` by rounding `addr` to the smallest multiple
-    /// of `align` that is larger or equal to `addr`
-    fn align_addr(addr: u64, align: u64) -> u64 {
-        match addr % align {
-            0 => addr,
-            rem => addr.checked_add(align).unwrap() - rem,
-        }
+impl GlobalStateInner {
+    pub fn free_alloc_id(&mut self, dead_id: AllocId) {
+        // We can *not* remove this from `base_addr`, since the interpreter design requires that we
+        // be able to retrieve an AllocId + offset for any memory access *before* we check if the
+        // access is valid. Specifically, `ptr_get_alloc` is called on each attempt at a memory
+        // access to determine the allocation ID and offset -- and there can still be pointers with
+        // `dead_id` that one can attempt to use for a memory access. `ptr_get_alloc` may return
+        // `None` only if the pointer truly has no provenance (this ensures consistent error
+        // messages).
+        // However, we *can* remove it from `int_to_ptr_map`, since any wildcard pointers that exist
+        // can no longer actually be accessing that address. This ensures `alloc_id_from_addr` never
+        // returns a dead allocation.
+        self.int_to_ptr_map.retain(|&(_, id)| id != dead_id);
+        // We can also remove it from `exposed`, since this allocation can anyway not be returned by
+        // `alloc_id_from_addr` any more.
+        self.exposed.remove(&dead_id);
     }
 }
@@ -273,7 +298,7 @@ mod tests {

     #[test]
     fn test_align_addr() {
-        assert_eq!(GlobalStateInner::align_addr(37, 4), 40);
-        assert_eq!(GlobalStateInner::align_addr(44, 4), 44);
+        assert_eq!(align_addr(37, 4), 40);
+        assert_eq!(align_addr(44, 4), 44);
     }
 }
@@ -117,7 +117,7 @@ pub use crate::eval::{
     create_ecx, eval_entry, AlignmentCheck, BacktraceStyle, IsolatedOp, MiriConfig, RejectOpWith,
 };
 pub use crate::helpers::EvalContextExt as _;
-pub use crate::intptrcast::ProvenanceMode;
+pub use crate::intptrcast::{EvalContextExt as _, ProvenanceMode};
 pub use crate::machine::{
     AllocExtra, FrameExtra, MiriInterpCx, MiriInterpCxExt, MiriMachine, MiriMemoryKind,
     PrimitiveLayouts, Provenance, ProvenanceExtra,
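This re-export matters because `expose_ptr` and friends are now extension-trait methods: they are only callable where the trait is in scope, and importing it `as _` makes the methods available without claiming the name `EvalContextExt` a second time (the `helpers` one is already re-exported that way). A generic illustration of the pattern, with a toy trait and type rather than Miri's:

mod intptrcast_like {
    pub trait EvalContextExt {
        fn expose_ptr_demo(&self);
    }
    // In Miri the impl target is MiriInterpCx; a plain u32 stands in here.
    impl EvalContextExt for u32 {
        fn expose_ptr_demo(&self) {
            println!("exposing {self}");
        }
    }
}

// The underscore import: method resolution sees the trait, the name stays free.
// (Miri's lib.rs uses `pub use ... as _` to spread this crate-wide.)
use intptrcast_like::EvalContextExt as _;

fn main() {
    7u32.expose_ptr_demo(); // works only because the trait is in scope
}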
@@ -1149,7 +1149,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
             // Value does not matter, SB is disabled
             BorTag::default()
         };
-        intptrcast::GlobalStateInner::ptr_from_rel_ptr(ecx, ptr, tag)
+        ecx.ptr_from_rel_ptr(ptr, tag)
     }

     /// Called on `usize as ptr` casts.
@@ -1158,7 +1158,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
         ecx: &MiriInterpCx<'mir, 'tcx>,
         addr: u64,
     ) -> InterpResult<'tcx, Pointer<Option<Self::Provenance>>> {
-        intptrcast::GlobalStateInner::ptr_from_addr_cast(ecx, addr)
+        ecx.ptr_from_addr_cast(addr)
     }

     /// Called on `ptr as usize` casts.
@@ -1169,8 +1169,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
         ptr: Pointer<Self::Provenance>,
     ) -> InterpResult<'tcx> {
         match ptr.provenance {
-            Provenance::Concrete { alloc_id, tag } =>
-                intptrcast::GlobalStateInner::expose_ptr(ecx, alloc_id, tag),
+            Provenance::Concrete { alloc_id, tag } => ecx.expose_ptr(alloc_id, tag),
             Provenance::Wildcard => {
                 // No need to do anything for wildcard pointers as
                 // their provenances have already been previously exposed.
@@ -1191,7 +1190,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
         ecx: &MiriInterpCx<'mir, 'tcx>,
         ptr: Pointer<Self::Provenance>,
     ) -> Option<(AllocId, Size, Self::ProvenanceExtra)> {
-        let rel = intptrcast::GlobalStateInner::ptr_get_alloc(ecx, ptr);
+        let rel = ecx.ptr_get_alloc(ptr);

         rel.map(|(alloc_id, size)| {
             let tag = match ptr.provenance {
@@ -1263,6 +1262,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
         {
             *deallocated_at = Some(machine.current_span());
         }
+        machine.intptrcast.get_mut().free_alloc_id(alloc_id);
         Ok(())
     }