Commit 52e8856

Auto merge of #58556 - oli-obk:imperative_recursion, r=pnkfelix

Optimize copying large ranges of undefmask blocks. Hopefully fixes #58523.

2 parents: ad8a3eb + 2a1eb1c

3 files changed: +165 -39 lines


src/librustc/mir/interpret/allocation.rs (+63 -13)
```diff
@@ -101,8 +101,7 @@ impl AllocationExtra<(), ()> for () {
 impl<Tag, Extra> Allocation<Tag, Extra> {
     /// Creates a read-only allocation initialized by the given bytes
     pub fn from_bytes(slice: &[u8], align: Align, extra: Extra) -> Self {
-        let mut undef_mask = UndefMask::new(Size::ZERO);
-        undef_mask.grow(Size::from_bytes(slice.len() as u64), true);
+        let undef_mask = UndefMask::new(Size::from_bytes(slice.len() as u64), true);
         Self {
             bytes: slice.to_owned(),
             relocations: Relocations::new(),
@@ -122,7 +121,7 @@ impl<Tag, Extra> Allocation<Tag, Extra> {
         Allocation {
             bytes: vec![0; size.bytes() as usize],
             relocations: Relocations::new(),
-            undef_mask: UndefMask::new(size),
+            undef_mask: UndefMask::new(size, false),
             align,
             mutability: Mutability::Mutable,
             extra,
@@ -614,8 +613,9 @@ impl<Tag> DerefMut for Relocations<Tag> {
 ////////////////////////////////////////////////////////////////////////////////

 type Block = u64;
-const BLOCK_SIZE: u64 = 64;

+/// A bitmask where each bit refers to the byte with the same index. If the bit is `true`, the byte
+/// is defined. If it is `false` the byte is undefined.
 #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord, Hash, RustcEncodable, RustcDecodable)]
 pub struct UndefMask {
     blocks: Vec<Block>,
@@ -625,12 +625,14 @@ pub struct UndefMask {
 impl_stable_hash_for!(struct mir::interpret::UndefMask{blocks, len});

 impl UndefMask {
-    pub fn new(size: Size) -> Self {
+    pub const BLOCK_SIZE: u64 = 64;
+
+    pub fn new(size: Size, state: bool) -> Self {
         let mut m = UndefMask {
             blocks: vec![],
             len: Size::ZERO,
         };
-        m.grow(size, false);
+        m.grow(size, state);
         m
     }

@@ -644,6 +646,7 @@ impl UndefMask {
             return Err(self.len);
         }

+        // FIXME(oli-obk): optimize this for allocations larger than a block.
        let idx = (start.bytes()..end.bytes())
            .map(|i| Size::from_bytes(i))
            .find(|&i| !self.get(i));
@@ -663,20 +666,63 @@ impl UndefMask {
     }

     pub fn set_range_inbounds(&mut self, start: Size, end: Size, new_state: bool) {
-        for i in start.bytes()..end.bytes() {
-            self.set(Size::from_bytes(i), new_state);
+        let (blocka, bita) = bit_index(start);
+        let (blockb, bitb) = bit_index(end);
+        if blocka == blockb {
+            // first set all bits but the first `bita`
+            // then unset the last `64 - bitb` bits
+            let range = if bitb == 0 {
+                u64::max_value() << bita
+            } else {
+                (u64::max_value() << bita) & (u64::max_value() >> (64 - bitb))
+            };
+            if new_state {
+                self.blocks[blocka] |= range;
+            } else {
+                self.blocks[blocka] &= !range;
+            }
+            return;
+        }
+        // across block boundaries
+        if new_state {
+            // set bita..64 to 1
+            self.blocks[blocka] |= u64::max_value() << bita;
+            // set 0..bitb to 1
+            if bitb != 0 {
+                self.blocks[blockb] |= u64::max_value() >> (64 - bitb);
+            }
+            // fill in all the other blocks (much faster than one bit at a time)
+            for block in (blocka + 1) .. blockb {
+                self.blocks[block] = u64::max_value();
+            }
+        } else {
+            // set bita..64 to 0
+            self.blocks[blocka] &= !(u64::max_value() << bita);
+            // set 0..bitb to 0
+            if bitb != 0 {
+                self.blocks[blockb] &= !(u64::max_value() >> (64 - bitb));
+            }
+            // fill in all the other blocks (much faster than one bit at a time)
+            for block in (blocka + 1) .. blockb {
+                self.blocks[block] = 0;
+            }
         }
     }

     #[inline]
     pub fn get(&self, i: Size) -> bool {
         let (block, bit) = bit_index(i);
-        (self.blocks[block] & 1 << bit) != 0
+        (self.blocks[block] & (1 << bit)) != 0
     }

     #[inline]
     pub fn set(&mut self, i: Size, new_state: bool) {
         let (block, bit) = bit_index(i);
+        self.set_bit(block, bit, new_state);
+    }
+
+    #[inline]
+    fn set_bit(&mut self, block: usize, bit: usize, new_state: bool) {
         if new_state {
             self.blocks[block] |= 1 << bit;
         } else {
@@ -685,11 +731,15 @@ impl UndefMask {
     }

     pub fn grow(&mut self, amount: Size, new_state: bool) {
-        let unused_trailing_bits = self.blocks.len() as u64 * BLOCK_SIZE - self.len.bytes();
+        if amount.bytes() == 0 {
+            return;
+        }
+        let unused_trailing_bits = self.blocks.len() as u64 * Self::BLOCK_SIZE - self.len.bytes();
         if amount.bytes() > unused_trailing_bits {
-            let additional_blocks = amount.bytes() / BLOCK_SIZE + 1;
+            let additional_blocks = amount.bytes() / Self::BLOCK_SIZE + 1;
             assert_eq!(additional_blocks as usize as u64, additional_blocks);
             self.blocks.extend(
+                // FIXME(oli-obk): optimize this by repeating `new_state as Block`
                 iter::repeat(0).take(additional_blocks as usize),
             );
         }
@@ -702,8 +752,8 @@ impl UndefMask {
 #[inline]
 fn bit_index(bits: Size) -> (usize, usize) {
     let bits = bits.bytes();
-    let a = bits / BLOCK_SIZE;
-    let b = bits % BLOCK_SIZE;
+    let a = bits / UndefMask::BLOCK_SIZE;
+    let b = bits % UndefMask::BLOCK_SIZE;
     assert_eq!(a as usize as u64, a);
     assert_eq!(b as usize as u64, b);
     (a as usize, b as usize)
```
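
The core of this file's change is the rewritten `set_range_inbounds`: rather than toggling one bit per byte, it builds a `u64` mask for the partial first and last blocks and writes whole blocks in between. Below is a minimal, self-contained sketch of that masking arithmetic; the bare `&mut [u64]` and the `set_range`/`bit_index` helpers are illustrative stand-ins for the real `UndefMask` and `Size` types, not rustc APIs:

```rust
const BLOCK_SIZE: u64 = 64;

/// Split a bit position into (block index, bit-within-block).
fn bit_index(bit: u64) -> (usize, usize) {
    ((bit / BLOCK_SIZE) as usize, (bit % BLOCK_SIZE) as usize)
}

/// Set bits `start..end` (end exclusive) to `new_state`, writing whole
/// `u64` words wherever possible instead of one bit at a time.
fn set_range(blocks: &mut [u64], start: u64, end: u64, new_state: bool) {
    let (blocka, bita) = bit_index(start);
    let (blockb, bitb) = bit_index(end);
    if blocka == blockb {
        // Everything lives in one block: one mask covers bits bita..bitb.
        // `bitb == 0` (range ends exactly on a block boundary) is handled
        // separately to avoid the overflowing shift `>> 64`.
        let range = if bitb == 0 {
            u64::max_value() << bita
        } else {
            (u64::max_value() << bita) & (u64::max_value() >> (64 - bitb))
        };
        if new_state {
            blocks[blocka] |= range;
        } else {
            blocks[blocka] &= !range;
        }
        return;
    }
    // Partial first block, partial last block, full blocks in between.
    if new_state {
        blocks[blocka] |= u64::max_value() << bita;
        if bitb != 0 {
            blocks[blockb] |= u64::max_value() >> (64 - bitb);
        }
        for b in (blocka + 1)..blockb {
            blocks[b] = u64::max_value();
        }
    } else {
        blocks[blocka] &= !(u64::max_value() << bita);
        if bitb != 0 {
            blocks[blockb] &= !(u64::max_value() >> (64 - bitb));
        }
        for b in (blocka + 1)..blockb {
            blocks[b] = 0;
        }
    }
}

fn main() {
    let mut blocks = vec![0u64; 3]; // room for 192 bits
    // Set bits 100..130, crossing the boundary between blocks 1 and 2.
    set_range(&mut blocks, 100, 130, true);
    assert_eq!(blocks[0], 0);                      // bits 0..64 untouched
    assert_eq!(blocks[1], u64::max_value() << 36); // bits 100..128 set
    assert_eq!(blocks[2], 0b11);                   // bits 128..130 set
    // Clearing a sub-range reuses the same masks via `&= !range`.
    set_range(&mut blocks, 110, 120, false);
    assert_eq!(blocks[1] & (1u64 << (110 - 64)), 0);
}
```

The `bitb == 0` branch exists because a range ending exactly on a block boundary would otherwise require `u64::max_value() >> 64`, which overflows (and panics in debug builds).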

src/librustc_mir/interpret/memory.rs (+76 -26)
```diff
@@ -700,24 +700,29 @@ impl<'a, 'mir, 'tcx, M: Machine<'a, 'mir, 'tcx>> Memory<'a, 'mir, 'tcx, M> {
         // relocations overlapping the edges; those would not be handled correctly).
         let relocations = {
             let relocations = self.get(src.alloc_id)?.relocations(self, src, size);
-            let mut new_relocations = Vec::with_capacity(relocations.len() * (length as usize));
-            for i in 0..length {
-                new_relocations.extend(
-                    relocations
-                    .iter()
-                    .map(|&(offset, reloc)| {
-                        // compute offset for current repetition
-                        let dest_offset = dest.offset + (i * size);
-                        (
-                            // shift offsets from source allocation to destination allocation
-                            offset + dest_offset - src.offset,
-                            reloc,
-                        )
-                    })
-                );
-            }
+            if relocations.is_empty() {
+                // nothing to copy, ignore even the `length` loop
+                Vec::new()
+            } else {
+                let mut new_relocations = Vec::with_capacity(relocations.len() * (length as usize));
+                for i in 0..length {
+                    new_relocations.extend(
+                        relocations
+                        .iter()
+                        .map(|&(offset, reloc)| {
+                            // compute offset for current repetition
+                            let dest_offset = dest.offset + (i * size);
+                            (
+                                // shift offsets from source allocation to destination allocation
+                                offset + dest_offset - src.offset,
+                                reloc,
+                            )
+                        })
+                    );
+                }

-            new_relocations
+                new_relocations
+            }
         };

         let tcx = self.tcx.tcx;
@@ -784,20 +789,65 @@ impl<'a, 'mir, 'tcx, M: Machine<'a, 'mir, 'tcx>> Memory<'a, 'mir, 'tcx, M> {
         // The bits have to be saved locally before writing to dest in case src and dest overlap.
         assert_eq!(size.bytes() as usize as u64, size.bytes());

-        let undef_mask = self.get(src.alloc_id)?.undef_mask.clone();
-        let dest_allocation = self.get_mut(dest.alloc_id)?;
+        let undef_mask = &self.get(src.alloc_id)?.undef_mask;
+
+        // Since we are copying `size` bytes from `src` to `dest + i * size` (`for i in 0..repeat`),
+        // a naive undef mask copying algorithm would repeatedly have to read the undef mask from
+        // the source and write it to the destination. Even if we optimized the memory accesses,
+        // we'd be doing all of this `repeat` times.
+        // Therefore we precompute a compressed version of the undef mask of the source value and
+        // then write it back `repeat` times without computing any more information from the source.
+
+        // a precomputed cache for ranges of defined/undefined bits
+        // 0000010010001110 will become
+        // [5, 1, 2, 1, 3, 3, 1]
+        // where each element toggles the state
+        let mut ranges = smallvec::SmallVec::<[u64; 1]>::new();
+        let first = undef_mask.get(src.offset);
+        let mut cur_len = 1;
+        let mut cur = first;
+        for i in 1..size.bytes() {
+            // FIXME: optimize to bitshift the current undef block's bits and read the top bit
+            if undef_mask.get(src.offset + Size::from_bytes(i)) == cur {
+                cur_len += 1;
+            } else {
+                ranges.push(cur_len);
+                cur_len = 1;
+                cur = !cur;
+            }
+        }

-        for i in 0..size.bytes() {
-            let defined = undef_mask.get(src.offset + Size::from_bytes(i));
+        // now fill in all the data
+        let dest_allocation = self.get_mut(dest.alloc_id)?;
+        // an optimization where we can just overwrite an entire range of definedness bits if
+        // they are going to be uniformly `1` or `0`.
+        if ranges.is_empty() {
+            dest_allocation.undef_mask.set_range_inbounds(
+                dest.offset,
+                dest.offset + size * repeat,
+                first,
+            );
+            return Ok(())
+        }

-            for j in 0..repeat {
-                dest_allocation.undef_mask.set(
-                    dest.offset + Size::from_bytes(i + (size.bytes() * j)),
-                    defined
+        // remember to fill in the trailing bits
+        ranges.push(cur_len);
+
+        for mut j in 0..repeat {
+            j *= size.bytes();
+            j += dest.offset.bytes();
+            let mut cur = first;
+            for range in &ranges {
+                let old_j = j;
+                j += range;
+                dest_allocation.undef_mask.set_range_inbounds(
+                    Size::from_bytes(old_j),
+                    Size::from_bytes(j),
+                    cur,
                );
+                cur = !cur;
            }
        }
-
        Ok(())
    }
 }
```
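
The second hunk replaces the per-bit copy loop in `copy_undef_mask` with a run-length encoding of the source mask that is computed once and replayed `repeat` times through `set_range_inbounds`. Here is a minimal sketch of that compress-and-replay idea over a plain `Vec<bool>` standing in for the bit-packed `UndefMask` and `SmallVec`; the `compress` and `expand_into` helpers are illustrative, not rustc APIs:

```rust
/// Run-length encode a bit sequence as (first_state, run_lengths), where
/// each run length toggles the state, mirroring the `ranges` cache built
/// in `copy_undef_mask`: 0000010010001110 becomes (false, [5, 1, 2, 1, 3, 3, 1]).
fn compress(bits: &[bool]) -> (bool, Vec<u64>) {
    let first = bits[0];
    let mut ranges = Vec::new();
    let mut cur = first;
    let mut cur_len = 1u64;
    for &bit in &bits[1..] {
        if bit == cur {
            cur_len += 1;
        } else {
            ranges.push(cur_len);
            cur_len = 1;
            cur = !cur;
        }
    }
    ranges.push(cur_len); // remember to fill in the trailing run
    (first, ranges)
}

/// Replay the runs `repeat` times, appending to `dest`; each run becomes
/// one bulk write, like the `set_range_inbounds` calls in the patch.
fn expand_into(dest: &mut Vec<bool>, first: bool, ranges: &[u64], repeat: u64) {
    for _ in 0..repeat {
        let mut cur = first;
        for &len in ranges {
            dest.extend(std::iter::repeat(cur).take(len as usize));
            cur = !cur;
        }
    }
}

fn main() {
    let src: Vec<bool> = "0000010010001110".chars().map(|c| c == '1').collect();
    let (first, ranges) = compress(&src);
    assert_eq!(first, false);
    assert_eq!(ranges, vec![5, 1, 2, 1, 3, 3, 1]);

    // Copy the source pattern twice, as the patch does for repeat == 2.
    let mut dest = Vec::new();
    expand_into(&mut dest, first, &ranges, 2);
    assert_eq!(dest.len(), 2 * src.len());
    assert_eq!(&dest[..src.len()], &src[..]);
    assert_eq!(&dest[src.len()..], &src[..]);
}
```

In the real code a fully uniform source mask leaves `ranges` empty, so the early-return path covers the entire destination with a single `set_range_inbounds` call instead of replaying any runs.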
New test file (+26; filename not shown in this view):

```rust
// ignore-cross-compile
// ignore-stage1

#![feature(rustc_private)]

extern crate rustc;

use rustc::mir::interpret::UndefMask;
use rustc::ty::layout::Size;

fn main() {
    let mut mask = UndefMask::new(Size::from_bytes(500), false);
    assert!(!mask.get(Size::from_bytes(499)));
    mask.set(Size::from_bytes(499), true);
    assert!(mask.get(Size::from_bytes(499)));
    mask.set_range_inbounds(Size::from_bytes(100), Size::from_bytes(256), true);
    for i in 0..100 {
        assert!(!mask.get(Size::from_bytes(i)));
    }
    for i in 100..256 {
        assert!(mask.get(Size::from_bytes(i)));
    }
    for i in 256..499 {
        assert!(!mask.get(Size::from_bytes(i)));
    }
}
```
