Remove separate encoding for a single nonzero-mapping byte
In practice, for the two data sets that still use the bitset encoding (uppercase and lowercase), the separate encoding is not a significant win, so just drop it entirely. Dropping it costs us about 5 bytes of table data, but the complexity it removes is nontrivial.
This commit is contained in:
parent
9c1ceece20
commit
b6bc906004
3 changed files with 9 additions and 46 deletions
|
@ -10,7 +10,6 @@ fn bitset_search<
|
|||
>(
|
||||
needle: u32,
|
||||
chunk_idx_map: &[u8; N],
|
||||
last_chunk_idx: u16,
|
||||
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
|
||||
bitset_canonical: &[u64; CANONICAL],
|
||||
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
|
||||
|
@ -18,12 +17,8 @@ fn bitset_search<
|
|||
let bucket_idx = (needle / 64) as usize;
|
||||
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
|
||||
let chunk_piece = bucket_idx % CHUNK_SIZE;
|
||||
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
|
||||
// so we need to remap it
|
||||
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
|
||||
chunk_idx_map[chunk_map_idx]
|
||||
} else if chunk_map_idx == last_chunk_idx as usize {
|
||||
chunk_idx_map[chunk_idx_map.len() - 1]
|
||||
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
|
||||
v
|
||||
} else {
|
||||
return false;
|
||||
};
|
||||
|
@ -317,12 +312,12 @@ pub mod grapheme_extend {
|
|||
|
||||
#[rustfmt::skip]
|
||||
pub mod lowercase {
|
||||
const BITSET_LAST_CHUNK_MAP: u16 = 122;
|
||||
static BITSET_CHUNKS_MAP: [u8; 119] = [
|
||||
static BITSET_CHUNKS_MAP: [u8; 123] = [
|
||||
13, 16, 0, 0, 8, 0, 0, 11, 12, 9, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 3, 1, 0, 14, 0, 7, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 6,
|
||||
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0,
|
||||
0, 0, 6,
|
||||
];
|
||||
static BITSET_INDEX_CHUNKS: [[u8; 16]; 18] = [
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
|
@ -408,7 +403,6 @@ pub mod lowercase {
|
|||
super::bitset_search(
|
||||
c as u32,
|
||||
&BITSET_CHUNKS_MAP,
|
||||
BITSET_LAST_CHUNK_MAP,
|
||||
&BITSET_INDEX_CHUNKS,
|
||||
&BITSET_CANONICAL,
|
||||
&BITSET_MAPPING,
|
||||
|
@ -449,13 +443,12 @@ pub mod n {
|
|||
|
||||
#[rustfmt::skip]
|
||||
pub mod uppercase {
|
||||
const BITSET_LAST_CHUNK_MAP: u16 = 124;
|
||||
static BITSET_CHUNKS_MAP: [u8; 124] = [
|
||||
static BITSET_CHUNKS_MAP: [u8; 125] = [
|
||||
12, 15, 5, 5, 0, 5, 5, 2, 4, 11, 5, 14, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 6, 5, 13, 5, 10, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 16, 5, 5,
|
||||
5, 5, 9, 3,
|
||||
5, 5, 9, 5, 3,
|
||||
];
|
||||
static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [
|
||||
[41, 41, 5, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 5, 0],
|
||||
|
@ -529,7 +522,6 @@ pub mod uppercase {
|
|||
super::bitset_search(
|
||||
c as u32,
|
||||
&BITSET_CHUNKS_MAP,
|
||||
BITSET_LAST_CHUNK_MAP,
|
||||
&BITSET_INDEX_CHUNKS,
|
||||
&BITSET_CANONICAL,
|
||||
&BITSET_MAPPING,
|
||||
|
|
|
@ -8,7 +8,6 @@ fn bitset_search<
|
|||
>(
|
||||
needle: u32,
|
||||
chunk_idx_map: &[u8; N],
|
||||
last_chunk_idx: u16,
|
||||
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
|
||||
bitset_canonical: &[u64; CANONICAL],
|
||||
bitset_canonicalized: &[(u8, u8); CANONICALIZED],
|
||||
|
@ -16,12 +15,8 @@ fn bitset_search<
|
|||
let bucket_idx = (needle / 64) as usize;
|
||||
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
|
||||
let chunk_piece = bucket_idx % CHUNK_SIZE;
|
||||
// The last entry of `chunk_idx_map` actually should be at `last_chunk_idx`,
|
||||
// so we need to remap it
|
||||
let chunk_idx = if chunk_map_idx < (chunk_idx_map.len() - 1) {
|
||||
chunk_idx_map[chunk_map_idx]
|
||||
} else if chunk_map_idx == last_chunk_idx as usize {
|
||||
chunk_idx_map[chunk_idx_map.len() - 1]
|
||||
let chunk_idx = if let Some(&v) = chunk_idx_map.get(chunk_map_idx) {
|
||||
v
|
||||
} else {
|
||||
return false;
|
||||
};
|
||||
|
|
|
@ -139,7 +139,6 @@ impl RawEmitter {
|
|||
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
|
||||
writeln!(&mut self.file, " c as u32,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
|
||||
writeln!(&mut self.file, " BITSET_LAST_CHUNK_MAP,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_CANONICAL,").unwrap();
|
||||
writeln!(&mut self.file, " &BITSET_MAPPING,").unwrap();
|
||||
|
@ -170,29 +169,6 @@ impl RawEmitter {
|
|||
chunk_indices.push(chunk_map[chunk]);
|
||||
}
|
||||
|
||||
// If one of the chunks has all of the entries point to the bitset
|
||||
// word filled with zeros, then pop those off the end -- we know they
|
||||
// are useless.
|
||||
let zero_chunk_idx = chunks.iter().position(|chunk| chunk.iter().all(|e| *e == zero_at));
|
||||
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
|
||||
chunk_indices.pop();
|
||||
}
|
||||
// We do not count the LAST_CHUNK_MAP as adding bytes because it's a
|
||||
// small constant whose values are inlined directly into the instruction
|
||||
// stream.
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
"const BITSET_LAST_CHUNK_MAP: u16 = {};",
|
||||
chunk_indices.len() - 1,
|
||||
)
|
||||
.unwrap();
|
||||
let nonzero = chunk_indices.pop().unwrap();
|
||||
// Try to pop again, now that we've recorded a non-zero pointing index
|
||||
// into the LAST_CHUNK_MAP.
|
||||
while zero_chunk_idx.is_some() && chunk_indices.last().cloned() == zero_chunk_idx {
|
||||
chunk_indices.pop();
|
||||
}
|
||||
chunk_indices.push(nonzero);
|
||||
writeln!(
|
||||
&mut self.file,
|
||||
"static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue