Remove separate encoding for a single nonzero-mapping byte
In practice, for the two data sets that still use the bitset encoding (uppercase and lowercase), this is not a significant win, so just drop it entirely. Dropping it costs us about 5 bytes, but the complexity it removes is nontrivial.
This commit is contained in:
parent
9c1ceece20
commit
b6bc906004
3 changed files with 9 additions and 46 deletions
|
@ -10,7 +10,6 @@ fn bitset_search<
|
||||||
>(
|
>(
|
||||||
needle: u32,
|
needle: u32,
|
||||||
chunk_idx_map: &[u8; N],
|
chunk_idx_map: &[u8; N],
|
||||||
last_chunk_idx: u16,
|
|
||||||
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
|
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
|
||||||
bitset_canonical: &[u64; CANONICAL],
|
bitset_canonical: &[u64; CANONICAL],
|
||||||
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
|
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
|
||||||
|
@ -18,12 +17,8 @@ fn bitset_search<
|
||||||
let bucket_idx = (needle / 64) as usize;
|
let bucket_idx = (needle / 64) as usize;
|
||||||
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
|
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
|
||||||
let chunk_piece = bucket_idx % CHUNK_SIZE;
|
let chunk_piece = bucket_idx % CHUNK_SIZE;
|
||||||
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
|
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
|
||||||
// so we need to remap it
|
v
|
||||||
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
|
|
||||||
chunk_idx_map[chunk_map_idx]
|
|
||||||
} else if chunk_map_idx == last_chunk_idx as usize {
|
|
||||||
chunk_idx_map[chunk_idx_map.len() - 1]
|
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
@ -317,12 +312,12 @@ pub mod grapheme_extend {
|
||||||
|
|
||||||
#[rustfmt::skip]
|
#[rustfmt::skip]
|
||||||
pub mod lowercase {
|
pub mod lowercase {
|
||||||
const BITSET_LAST_CHUNK_MAP: u16 = 122;
|
static BITSET_CHUNKS_MAP: [u8; 123] = [
|
||||||
static BITSET_CHUNKS_MAP: [u8; 119] = [
|
|
||||||
13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6,
|
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0,
|
||||||
|
0, 0, 6,
|
||||||
];
|
];
|
||||||
static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [
|
static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [
|
||||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||||
|
@ -408,7 +403,6 @@ pub mod lowercase {
|
||||||
super::bitset_search(
|
super::bitset_search(
|
||||||
c as u32,
|
c as u32,
|
||||||
&BITSET_CHUNKS_MAP,
|
&BITSET_CHUNKS_MAP,
|
||||||
BITSET_LAST_CHUNK_MAP,
|
|
||||||
&BITSET_INDEX_CHUNKS,
|
&BITSET_INDEX_CHUNKS,
|
||||||
&BITSET_CANONICAL,
|
&BITSET_CANONICAL,
|
||||||
&BITSET_MAPPING,
|
&BITSET_MAPPING,
|
||||||
|
@ -449,13 +443,12 @@ pub mod n {
|
||||||
|
|
||||||
#[rustfmt::skip]
|
#[rustfmt::skip]
|
||||||
pub mod uppercase {
|
pub mod uppercase {
|
||||||
const BITSET_LAST_CHUNK_MAP: u16 = 124;
|
static BITSET_CHUNKS_MAP: [u8; 125] = [
|
||||||
static BITSET_CHUNKS_MAP: [u8; 124] = [
|
|
||||||
12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||||
5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||||
5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5,
|
5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5,
|
||||||
5, 5, 9, 3,
|
5, 5, 9, 5, 3,
|
||||||
];
|
];
|
||||||
static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [
|
static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [
|
||||||
[41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0],
|
[41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0],
|
||||||
|
@ -529,7 +522,6 @@ pub mod uppercase {
|
||||||
super::bitset_search(
|
super::bitset_search(
|
||||||
c as u32,
|
c as u32,
|
||||||
&BITSET_CHUNKS_MAP,
|
&BITSET_CHUNKS_MAP,
|
||||||
BITSET_LAST_CHUNK_MAP,
|
|
||||||
&BITSET_INDEX_CHUNKS,
|
&BITSET_INDEX_CHUNKS,
|
||||||
&BITSET_CANONICAL,
|
&BITSET_CANONICAL,
|
||||||
&BITSET_MAPPING,
|
&BITSET_MAPPING,
|
||||||
|
|
|
@ -8,7 +8,6 @@ fn bitset_search<
|
||||||
>(
|
>(
|
||||||
needle: u32,
|
needle: u32,
|
||||||
chunk_idx_map: &[u8; N],
|
chunk_idx_map: &[u8; N],
|
||||||
last_chunk_idx: u16,
|
|
||||||
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
|
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
|
||||||
bitset_canonical: &[u64; CANONICAL],
|
bitset_canonical: &[u64; CANONICAL],
|
||||||
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
|
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
|
||||||
|
@ -16,12 +15,8 @@ fn bitset_search<
|
||||||
let bucket_idx = (needle / 64) as usize;
|
let bucket_idx = (needle / 64) as usize;
|
||||||
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
|
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
|
||||||
let chunk_piece = bucket_idx % CHUNK_SIZE;
|
let chunk_piece = bucket_idx % CHUNK_SIZE;
|
||||||
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
|
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
|
||||||
// so we need to remap it
|
v
|
||||||
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
|
|
||||||
chunk_idx_map[chunk_map_idx]
|
|
||||||
} else if chunk_map_idx == last_chunk_idx as usize {
|
|
||||||
chunk_idx_map[chunk_idx_map.len() - 1]
|
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
|
|
@ -139,7 +139,6 @@ impl RawEmitter {
|
||||||
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
|
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
|
||||||
writeln!(&mut self.file, " c as u32,").unwrap();
|
writeln!(&mut self.file, " c as u32,").unwrap();
|
||||||
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
|
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
|
||||||
writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap();
|
|
||||||
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
|
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
|
||||||
writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
|
writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
|
||||||
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
|
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
|
||||||
|
@ -170,29 +169,6 @@ impl RawEmitter {
|
||||||
chunk_indices.push(chunk_map[chunk]);
|
chunk_indices.push(chunk_map[chunk]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If one of the chunks has all of the entries point to the bitset
|
|
||||||
// word filled with zeros, then pop those off the end -- we know they
|
|
||||||
// are useless.
|
|
||||||
let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at));
|
|
||||||
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
|
|
||||||
chunk_indices.pop();
|
|
||||||
}
|
|
||||||
// We do not count the LAST_CHUNK_MAP as adding bytes because it's a
|
|
||||||
// small constant whose values are inlined directly into the instruction
|
|
||||||
// stream.
|
|
||||||
writeln!(
|
|
||||||
&mut self.file,
|
|
||||||
"const BITSET_LAST_CHUNK_MAP: u16 = {};",
|
|
||||||
chunk_indices.len() - 1,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
let nonzero = chunk_indices.pop().unwrap();
|
|
||||||
// Try to pop again, now that we've recorded a non-zero pointing index
|
|
||||||
// into the LAST_CHUNK_MAP.
|
|
||||||
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
|
|
||||||
chunk_indices.pop();
|
|
||||||
}
|
|
||||||
chunk_indices.push(nonzero);
|
|
||||||
writeln!(
|
writeln!(
|
||||||
&mut self.file,
|
&mut self.file,
|
||||||
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
|
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue