Remove dead control char logic
Only newlines and multibyte characters are actually relevant
This commit is contained in:
parent
3c7c38ad93
commit
d6ca7ad0d7
1 changed file with 47 additions and 112 deletions
|
@@ -95,59 +95,32 @@ cfg_match! {
|
||||||
if multibyte_mask == 0 {
|
if multibyte_mask == 0 {
|
||||||
assert!(intra_chunk_offset == 0);
|
assert!(intra_chunk_offset == 0);
|
||||||
|
|
||||||
// Check if there are any control characters in the chunk. All
|
// Check for newlines in the chunk
|
||||||
// control characters that we can encounter at this point have a
|
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
|
||||||
// byte value less than 32 or ...
|
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
|
||||||
let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) };
|
|
||||||
let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) };
|
|
||||||
|
|
||||||
// ... it's the ASCII 'DEL' character with a value of 127.
|
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
|
||||||
let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) };
|
|
||||||
let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) };
|
|
||||||
|
|
||||||
let control_char_mask = control_char_mask0 | control_char_mask1;
|
while newlines_mask != 0 {
|
||||||
|
let index = newlines_mask.trailing_zeros();
|
||||||
|
|
||||||
if control_char_mask != 0 {
|
lines.push(RelativeBytePos(index) + output_offset);
|
||||||
// Check for newlines in the chunk
|
|
||||||
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
|
|
||||||
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
|
|
||||||
|
|
||||||
if control_char_mask == newlines_mask {
|
// Clear the bit, so we can find the next one.
|
||||||
// All control characters are newlines, record them
|
newlines_mask &= newlines_mask - 1;
|
||||||
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
|
|
||||||
|
|
||||||
while newlines_mask != 0 {
|
|
||||||
let index = newlines_mask.trailing_zeros();
|
|
||||||
|
|
||||||
lines.push(RelativeBytePos(index) + output_offset);
|
|
||||||
|
|
||||||
// Clear the bit, so we can find the next one.
|
|
||||||
newlines_mask &= newlines_mask - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We are done for this chunk. All control characters were
|
|
||||||
// newlines and we took care of those.
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
// Some of the control characters are not newlines,
|
|
||||||
// fall through to the slow path below.
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// No control characters, nothing to record for this chunk
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// The slow path.
|
||||||
|
// There are multibyte chars in here, fallback to generic decoding.
|
||||||
|
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
|
||||||
|
intra_chunk_offset = analyze_source_file_generic(
|
||||||
|
&src[scan_start..],
|
||||||
|
CHUNK_SIZE - intra_chunk_offset,
|
||||||
|
RelativeBytePos::from_usize(scan_start),
|
||||||
|
lines,
|
||||||
|
multi_byte_chars,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The slow path.
|
|
||||||
// There are control chars in here, fallback to generic decoding.
|
|
||||||
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
|
|
||||||
intra_chunk_offset = analyze_source_file_generic(
|
|
||||||
&src[scan_start..],
|
|
||||||
CHUNK_SIZE - intra_chunk_offset,
|
|
||||||
RelativeBytePos::from_usize(scan_start),
|
|
||||||
lines,
|
|
||||||
multi_byte_chars,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// There might still be a tail left to analyze
|
// There might still be a tail left to analyze
|
||||||
|
@@ -247,59 +220,32 @@ cfg_match! {
|
||||||
if multibyte_mask == 0 {
|
if multibyte_mask == 0 {
|
||||||
assert!(intra_chunk_offset == 0);
|
assert!(intra_chunk_offset == 0);
|
||||||
|
|
||||||
// Check if there are any control characters in the chunk. All
|
// Check for newlines in the chunk
|
||||||
// control characters that we can encounter at this point have a
|
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
|
||||||
// byte value less than 32 or ...
|
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
|
||||||
let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) };
|
|
||||||
let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) };
|
|
||||||
|
|
||||||
// ... it's the ASCII 'DEL' character with a value of 127.
|
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
|
||||||
let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) };
|
|
||||||
let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) };
|
|
||||||
|
|
||||||
let control_char_mask = control_char_mask0 | control_char_mask1;
|
while newlines_mask != 0 {
|
||||||
|
let index = newlines_mask.trailing_zeros();
|
||||||
|
|
||||||
if control_char_mask != 0 {
|
lines.push(RelativeBytePos(index) + output_offset);
|
||||||
// Check for newlines in the chunk
|
|
||||||
let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
|
|
||||||
let mut newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
|
|
||||||
|
|
||||||
if control_char_mask == newlines_mask {
|
// Clear the bit, so we can find the next one.
|
||||||
// All control characters are newlines, record them
|
newlines_mask &= newlines_mask - 1;
|
||||||
let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
|
|
||||||
|
|
||||||
while newlines_mask != 0 {
|
|
||||||
let index = newlines_mask.trailing_zeros();
|
|
||||||
|
|
||||||
lines.push(RelativeBytePos(index) + output_offset);
|
|
||||||
|
|
||||||
// Clear the bit, so we can find the next one.
|
|
||||||
newlines_mask &= newlines_mask - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We are done for this chunk. All control characters were
|
|
||||||
// newlines and we took care of those.
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
// Some of the control characters are not newlines,
|
|
||||||
// fall through to the slow path below.
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// No control characters, nothing to record for this chunk
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// The slow path.
|
||||||
|
// There are multibyte chars in here, fallback to generic decoding.
|
||||||
|
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
|
||||||
|
intra_chunk_offset = analyze_source_file_generic(
|
||||||
|
&src[scan_start..],
|
||||||
|
CHUNK_SIZE - intra_chunk_offset,
|
||||||
|
RelativeBytePos::from_usize(scan_start),
|
||||||
|
lines,
|
||||||
|
multi_byte_chars,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The slow path.
|
|
||||||
// There are control chars in here, fallback to generic decoding.
|
|
||||||
let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
|
|
||||||
intra_chunk_offset = analyze_source_file_generic(
|
|
||||||
&src[scan_start..],
|
|
||||||
CHUNK_SIZE - intra_chunk_offset,
|
|
||||||
RelativeBytePos::from_usize(scan_start),
|
|
||||||
lines,
|
|
||||||
multi_byte_chars,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// There might still be a tail left to analyze
|
// There might still be a tail left to analyze
|
||||||
|
@@ -357,29 +303,18 @@ fn analyze_source_file_generic(
|
||||||
// string.
|
// string.
|
||||||
let mut char_len = 1;
|
let mut char_len = 1;
|
||||||
|
|
||||||
if byte < 32 {
|
if byte == b'\n' {
|
||||||
// This is an ASCII control character, it could be one of the cases
|
|
||||||
// that are interesting to us.
|
|
||||||
|
|
||||||
let pos = RelativeBytePos::from_usize(i) + output_offset;
|
let pos = RelativeBytePos::from_usize(i) + output_offset;
|
||||||
|
lines.push(pos + RelativeBytePos(1));
|
||||||
if let b'\n' = byte {
|
} else if byte >= 128 {
|
||||||
lines.push(pos + RelativeBytePos(1));
|
// This is the beginning of a multibyte char. Just decode to `char`.
|
||||||
}
|
|
||||||
} else if byte >= 127 {
|
|
||||||
// The slow path:
|
|
||||||
// This is either ASCII control character "DEL" or the beginning of
|
|
||||||
// a multibyte char. Just decode to `char`.
|
|
||||||
let c = src[i..].chars().next().unwrap();
|
let c = src[i..].chars().next().unwrap();
|
||||||
char_len = c.len_utf8();
|
char_len = c.len_utf8();
|
||||||
|
|
||||||
let pos = RelativeBytePos::from_usize(i) + output_offset;
|
let pos = RelativeBytePos::from_usize(i) + output_offset;
|
||||||
|
assert!((2..=4).contains(&char_len));
|
||||||
if char_len > 1 {
|
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
|
||||||
assert!((2..=4).contains(&char_len));
|
multi_byte_chars.push(mbc);
|
||||||
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
|
|
||||||
multi_byte_chars.push(mbc);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
i += char_len;
|
i += char_len;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue