1
Fork 0

Remove separate encoding for a single nonzero-mapping byte

In practice, for the two data sets that still use the bitset encoding (uppercase
and lowercase) this is not a significant win, so just drop it entirely. It costs
us about 5 bytes, and the complexity is nontrivial.
This commit is contained in:
Mark Rousskov 2020-03-27 18:01:14 -04:00
parent 9c1ceece20
commit b6bc906004
3 changed files with 9 additions and 46 deletions

View file

@ -10,7 +10,6 @@ fn bitset_search<
>(
needle: u32,
chunk_idx_map: &[u8; N],
last_chunk_idx: u16,
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
bitset_canonical: &[u64; CANONICAL],
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
@ -18,12 +17,8 @@ fn bitset_search<
let bucket_idx = (needle / 64) as usize;
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
let chunk_piece = bucket_idx % CHUNK_SIZE;
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
// so we need to remap it
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
chunk_idx_map[chunk_map_idx]
} else if chunk_map_idx == last_chunk_idx as usize {
chunk_idx_map[chunk_idx_map.len() - 1]
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
v
} else {
return false;
};
@ -317,12 +312,12 @@ pub mod grapheme_extend {
#[rustfmt::skip]
pub mod lowercase {
const BITSET_LAST_CHUNK_MAP: u16 = 122;
static BITSET_CHUNKS_MAP: [u8; 119] = [
static BITSET_CHUNKS_MAP: [u8; 123] = [
13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6,
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0,
0, 0, 6,
];
static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
@ -408,7 +403,6 @@ pub mod lowercase {
super::bitset_search(
c as u32,
&BITSET_CHUNKS_MAP,
BITSET_LAST_CHUNK_MAP,
&BITSET_INDEX_CHUNKS,
&BITSET_CANONICAL,
&BITSET_MAPPING,
@ -449,13 +443,12 @@ pub mod n {
#[rustfmt::skip]
pub mod uppercase {
const BITSET_LAST_CHUNK_MAP: u16 = 124;
static BITSET_CHUNKS_MAP: [u8; 124] = [
static BITSET_CHUNKS_MAP: [u8; 125] = [
12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5,
5, 5, 9, 3,
5, 5, 9, 5, 3,
];
static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [
[41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0],
@ -529,7 +522,6 @@ pub mod uppercase {
super::bitset_search(
c as u32,
&BITSET_CHUNKS_MAP,
BITSET_LAST_CHUNK_MAP,
&BITSET_INDEX_CHUNKS,
&BITSET_CANONICAL,
&BITSET_MAPPING,

View file

@ -8,7 +8,6 @@ fn bitset_search<
>(
needle: u32,
chunk_idx_map: &[u8; N],
last_chunk_idx: u16,
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
bitset_canonical: &[u64; CANONICAL],
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
@ -16,12 +15,8 @@ fn bitset_search<
let bucket_idx = (needle / 64) as usize;
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
let chunk_piece = bucket_idx % CHUNK_SIZE;
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
// so we need to remap it
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
chunk_idx_map[chunk_map_idx]
} else if chunk_map_idx == last_chunk_idx as usize {
chunk_idx_map[chunk_idx_map.len() - 1]
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
v
} else {
return false;
};

View file

@ -139,7 +139,6 @@ impl RawEmitter {
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
writeln!(&mut self.file, " c as u32,").unwrap();
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap();
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
@ -170,29 +169,6 @@ impl RawEmitter {
chunk_indices.push(chunk_map[chunk]);
}
// If one of the chunks has all of the entries point to the bitset
// word filled with zeros, then pop those off the end -- we know they
// are useless.
let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at));
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
chunk_indices.pop();
}
// We do not count the LAST_CHUNK_MAP as adding bytes because it's a
// small constant whose values are inlined directly into the instruction
// stream.
writeln!(
&mut self.file,
"const BITSET_LAST_CHUNK_MAP: u16 = {};",
chunk_indices.len() - 1,
)
.unwrap();
let nonzero = chunk_indices.pop().unwrap();
// Try to pop again, now that we've recorded a non-zero pointing index
// into the LAST_CHUNK_MAP.
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
chunk_indices.pop();
}
chunk_indices.push(nonzero);
writeln!(
&mut self.file,
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",