1
Fork 0

Make non-ASCII errors more consistent.

There are three kinds of "byte" literals: byte literals, byte string
literals, and raw byte string literals. None are allowed to have
non-ASCII chars in them.

Two `EscapeError` variants exist for when that constraint is violated.
- `NonAsciiCharInByte`: used for byte literals and byte string literals.
- `NonAsciiCharInByteString`: used for raw byte string literals.

As a result, the messages for raw byte string literals use different
wording, without good reason. Also, byte string literals are incorrectly
described as "byte constants" in some error messages.

This commit eliminates `NonAsciiCharInByteString` so the three cases are
handled similarly, and described correctly. The `mode` is enough to
distinguish them.

Note: Some existing error messages mention "byte constants" and some
mention "byte literals". I went with the latter here, because it's a
more correct name, as used by the Reference.
This commit is contained in:
Nicholas Nethercote 2022-11-03 15:17:37 +11:00
parent 34b32b0dac
commit 7dbf2c0ed8
15 changed files with 62 additions and 74 deletions

View file

@ -52,10 +52,8 @@ pub enum EscapeError {
/// Unicode escape code in byte literal. /// Unicode escape code in byte literal.
UnicodeEscapeInByte, UnicodeEscapeInByte,
/// Non-ascii character in byte literal. /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
NonAsciiCharInByte, NonAsciiCharInByte,
/// Non-ascii character in byte string literal.
NonAsciiCharInByteString,
/// After a line ending with '\', the next line contains whitespace /// After a line ending with '\', the next line contains whitespace
/// characters that are not skipped. /// characters that are not skipped.
@ -349,8 +347,7 @@ where
let start = src.len() - chars.as_str().len() - c.len_utf8(); let start = src.len() - chars.as_str().len() - c.len_utf8();
let result = match c { let result = match c {
'\r' => Err(EscapeError::BareCarriageReturnInRawString), '\r' => Err(EscapeError::BareCarriageReturnInRawString),
c if is_byte && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), _ => ascii_check(c, is_byte),
c => Ok(c),
}; };
let end = src.len() - chars.as_str().len(); let end = src.len() - chars.as_str().len();
callback(start..end, result); callback(start..end, result);

View file

@ -289,9 +289,6 @@ fn test_unescape_raw_byte_str() {
} }
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]);
check( check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(byte_from_char('a')))]);
"🦀a",
&[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))],
);
} }

View file

@ -231,16 +231,23 @@ pub(crate) fn emit_unescape_error(
.emit(); .emit();
} }
EscapeError::NonAsciiCharInByte => { EscapeError::NonAsciiCharInByte => {
assert!(mode.is_byte());
let (c, span) = last_char(); let (c, span) = last_char();
let mut err = handler.struct_span_err(span, "non-ASCII character in byte constant"); let desc = match mode {
Mode::Byte => "byte literal",
Mode::ByteStr => "byte string literal",
Mode::RawByteStr => "raw byte string literal",
_ => panic!("non-is_byte literal paired with NonAsciiCharInByte"),
};
let mut err = handler.struct_span_err(span, format!("non-ASCII character in {}", desc));
let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 { let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 {
format!(" but is {:?}", c) format!(" but is {:?}", c)
} else { } else {
String::new() String::new()
}; };
err.span_label(span, &format!("byte constant must be ASCII{}", postfix)); err.span_label(span, &format!("must be ASCII{}", postfix));
if (c as u32) <= 0xFF { // Note: the \\xHH suggestions are not given for raw byte string
// literals, because they are raw and so cannot use any escapes.
if (c as u32) <= 0xFF && mode != Mode::RawByteStr {
err.span_suggestion( err.span_suggestion(
span, span,
&format!( &format!(
@ -250,9 +257,9 @@ pub(crate) fn emit_unescape_error(
format!("\\x{:X}", c as u32), format!("\\x{:X}", c as u32),
Applicability::MaybeIncorrect, Applicability::MaybeIncorrect,
); );
} else if matches!(mode, Mode::Byte) { } else if mode == Mode::Byte {
err.span_label(span, "this multibyte character does not fit into a single byte"); err.span_label(span, "this multibyte character does not fit into a single byte");
} else if matches!(mode, Mode::ByteStr) { } else if mode != Mode::RawByteStr {
let mut utf8 = String::new(); let mut utf8 = String::new();
utf8.push(c); utf8.push(c);
err.span_suggestion( err.span_suggestion(
@ -270,19 +277,6 @@ pub(crate) fn emit_unescape_error(
} }
err.emit(); err.emit();
} }
EscapeError::NonAsciiCharInByteString => {
assert!(mode.is_byte());
let (c, span) = last_char();
let postfix = if unicode_width::UnicodeWidthChar::width(c).unwrap_or(1) == 0 {
format!(" but is {:?}", c)
} else {
String::new()
};
handler
.struct_span_err(span, "raw byte string must be ASCII")
.span_label(span, &format!("must be ASCII{}", postfix))
.emit();
}
EscapeError::OutOfRangeHexEscape => { EscapeError::OutOfRangeHexEscape => {
handler handler
.struct_span_err(span, "out of range hex escape") .struct_span_err(span, "out of range hex escape")

View file

@ -1,4 +1,4 @@
#![feature(rustc_attrs)] #![feature(rustc_attrs)]
#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte constant #[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte string literal
fn main() {} fn main() {}

View file

@ -1,8 +1,8 @@
error: non-ASCII character in byte constant error: non-ASCII character in byte string literal
--> $DIR/key-value-non-ascii.rs:3:19 --> $DIR/key-value-non-ascii.rs:3:19
| |
LL | #[rustc_dummy = b"ffi.rs"] LL | #[rustc_dummy = b"ffi.rs"]
| ^ byte constant must be ASCII | ^ must be ASCII
| |
help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes
| |

View file

@ -7,6 +7,6 @@ pub fn main() {
b'\x0Z'; //~ ERROR invalid character in numeric character escape: `Z` b'\x0Z'; //~ ERROR invalid character in numeric character escape: `Z`
b' '; //~ ERROR byte constant must be escaped b' '; //~ ERROR byte constant must be escaped
b'''; //~ ERROR byte constant must be escaped b'''; //~ ERROR byte constant must be escaped
b'é'; //~ ERROR non-ASCII character in byte constant b'é'; //~ ERROR non-ASCII character in byte literal
b'a //~ ERROR unterminated byte constant [E0763] b'a //~ ERROR unterminated byte constant [E0763]
} }

View file

@ -32,11 +32,11 @@ error: byte constant must be escaped: `'`
LL | b'''; LL | b''';
| ^ help: escape the character: `\'` | ^ help: escape the character: `\'`
error: non-ASCII character in byte constant error: non-ASCII character in byte literal
--> $DIR/byte-literals.rs:10:7 --> $DIR/byte-literals.rs:10:7
| |
LL | b'é'; LL | b'é';
| ^ byte constant must be ASCII | ^ must be ASCII
| |
help: if you meant to use the unicode code point for 'é', use a \xHH escape help: if you meant to use the unicode code point for 'é', use a \xHH escape
| |

View file

@ -3,7 +3,7 @@ static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape
pub fn main() { pub fn main() {
b"\f"; //~ ERROR unknown byte escape b"\f"; //~ ERROR unknown byte escape
b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z` b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z`
b"é"; //~ ERROR non-ASCII character in byte constant b"é"; //~ ERROR non-ASCII character in byte string literal
br##"é"##; //~ ERROR raw byte string must be ASCII br##"é"##; //~ ERROR non-ASCII character in raw byte string literal
b"a //~ ERROR unterminated double quote byte string b"a //~ ERROR unterminated double quote byte string
} }

View file

@ -20,18 +20,18 @@ error: invalid character in numeric character escape: `Z`
LL | b"\x0Z"; LL | b"\x0Z";
| ^ invalid character in numeric character escape | ^ invalid character in numeric character escape
error: non-ASCII character in byte constant error: non-ASCII character in byte string literal
--> $DIR/byte-string-literals.rs:6:7 --> $DIR/byte-string-literals.rs:6:7
| |
LL | b"é"; LL | b"é";
| ^ byte constant must be ASCII | ^ must be ASCII
| |
help: if you meant to use the unicode code point for 'é', use a \xHH escape help: if you meant to use the unicode code point for 'é', use a \xHH escape
| |
LL | b"\xE9"; LL | b"\xE9";
| ~~~~ | ~~~~
error: raw byte string must be ASCII error: non-ASCII character in raw byte string literal
--> $DIR/byte-string-literals.rs:7:10 --> $DIR/byte-string-literals.rs:7:10
| |
LL | br##"é"##; LL | br##"é"##;

View file

@ -2,6 +2,6 @@
pub fn main() { pub fn main() {
br"a "; //~ ERROR bare CR not allowed in raw string br"a "; //~ ERROR bare CR not allowed in raw string
br"é"; //~ ERROR raw byte string must be ASCII br"é"; //~ ERROR non-ASCII character in raw byte string literal
br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation
} }

View file

@ -4,7 +4,7 @@ error: bare CR not allowed in raw string
LL | br"a "; LL | br"a ";
| ^ | ^
error: raw byte string must be ASCII error: non-ASCII character in raw byte string literal
--> $DIR/raw-byte-string-literals.rs:5:8 --> $DIR/raw-byte-string-literals.rs:5:8
| |
LL | br"é"; LL | br"é";

View file

@ -14,15 +14,15 @@ fn main() {
println!("{:?}", r##"/* } if isAdmin begin admins only "##); println!("{:?}", r##"/* } if isAdmin begin admins only "##);
//~^ ERROR unicode codepoint changing visible direction of text present in literal //~^ ERROR unicode codepoint changing visible direction of text present in literal
println!("{:?}", b"/* } if isAdmin begin admins only "); println!("{:?}", b"/* } if isAdmin begin admins only ");
//~^ ERROR non-ASCII character in byte constant //~^ ERROR non-ASCII character in byte string literal
//~| ERROR non-ASCII character in byte constant //~| ERROR non-ASCII character in byte string literal
//~| ERROR non-ASCII character in byte constant //~| ERROR non-ASCII character in byte string literal
//~| ERROR non-ASCII character in byte constant //~| ERROR non-ASCII character in byte string literal
println!("{:?}", br##"/* } if isAdmin begin admins only "##); println!("{:?}", br##"/* } if isAdmin begin admins only "##);
//~^ ERROR raw byte string must be ASCII //~^ ERROR non-ASCII character in raw byte string literal
//~| ERROR raw byte string must be ASCII //~| ERROR non-ASCII character in raw byte string literal
//~| ERROR raw byte string must be ASCII //~| ERROR non-ASCII character in raw byte string literal
//~| ERROR raw byte string must be ASCII //~| ERROR non-ASCII character in raw byte string literal
println!("{:?}", ''); println!("{:?}", '');
//~^ ERROR unicode codepoint changing visible direction of text present in literal //~^ ERROR unicode codepoint changing visible direction of text present in literal
} }

View file

@ -14,69 +14,69 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r");
| |
= help: unicode escape sequences cannot be used as a byte or in a byte string = help: unicode escape sequences cannot be used as a byte or in a byte string
error: non-ASCII character in byte constant error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:26 --> $DIR/unicode-control-codepoints.rs:16:26
| |
LL | println!("{:?}", b"/* } if isAdmin begin admins only "); LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ byte constant must be ASCII but is '\u{202e}' | ^ must be ASCII but is '\u{202e}'
| |
help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
| |
LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only "); LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only ");
| ~~~~~~~~~~~~ | ~~~~~~~~~~~~
error: non-ASCII character in byte constant error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:30 --> $DIR/unicode-control-codepoints.rs:16:30
| |
LL | println!("{:?}", b"/* } if isAdmin begin admins only "); LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ byte constant must be ASCII but is '\u{2066}' | ^ must be ASCII but is '\u{2066}'
| |
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
| |
LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only "); LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only ");
| ~~~~~~~~~~~~ | ~~~~~~~~~~~~
error: non-ASCII character in byte constant error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:41 --> $DIR/unicode-control-codepoints.rs:16:41
| |
LL | println!("{:?}", b"/* } if isAdmin begin admins only "); LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ byte constant must be ASCII but is '\u{2069}' | ^ must be ASCII but is '\u{2069}'
| |
help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
| |
LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only "); LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only ");
| ~~~~~~~~~~~~ | ~~~~~~~~~~~~
error: non-ASCII character in byte constant error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:43 --> $DIR/unicode-control-codepoints.rs:16:43
| |
LL | println!("{:?}", b"/* } if isAdmin begin admins only "); LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ byte constant must be ASCII but is '\u{2066}' | ^ must be ASCII but is '\u{2066}'
| |
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
| |
LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only "); LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
| ~~~~~~~~~~~~ | ~~~~~~~~~~~~
error: raw byte string must be ASCII error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:29 --> $DIR/unicode-control-codepoints.rs:21:29
| |
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{202e}' | ^ must be ASCII but is '\u{202e}'
error: raw byte string must be ASCII error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:33 --> $DIR/unicode-control-codepoints.rs:21:33
| |
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2066}' | ^ must be ASCII but is '\u{2066}'
error: raw byte string must be ASCII error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:44 --> $DIR/unicode-control-codepoints.rs:21:44
| |
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2069}' | ^ must be ASCII but is '\u{2069}'
error: raw byte string must be ASCII error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:46 --> $DIR/unicode-control-codepoints.rs:21:46
| |
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);

View file

@ -2,17 +2,17 @@
fn main() { fn main() {
b'µ'; b'µ';
//~^ ERROR: non-ASCII character in byte constant //~^ ERROR: non-ASCII character in byte literal
//~| HELP: if you meant to use the unicode code point for 'µ', use a \xHH escape //~| HELP: if you meant to use the unicode code point for 'µ', use a \xHH escape
//~| NOTE: byte constant must be ASCII //~| NOTE: must be ASCII
b'字'; b'字';
//~^ ERROR: non-ASCII character in byte constant //~^ ERROR: non-ASCII character in byte literal
//~| NOTE: this multibyte character does not fit into a single byte //~| NOTE: this multibyte character does not fit into a single byte
//~| NOTE: byte constant must be ASCII //~| NOTE: must be ASCII
b""; b"";
//~^ ERROR: non-ASCII character in byte constant //~^ ERROR: non-ASCII character in byte string literal
//~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
//~| NOTE: byte constant must be ASCII //~| NOTE: must be ASCII
} }

View file

@ -1,28 +1,28 @@
error: non-ASCII character in byte constant error: non-ASCII character in byte literal
--> $DIR/multibyte-escapes.rs:4:7 --> $DIR/multibyte-escapes.rs:4:7
| |
LL | b'µ'; LL | b'µ';
| ^ byte constant must be ASCII | ^ must be ASCII
| |
help: if you meant to use the unicode code point for 'µ', use a \xHH escape help: if you meant to use the unicode code point for 'µ', use a \xHH escape
| |
LL | b'\xB5'; LL | b'\xB5';
| ~~~~ | ~~~~
error: non-ASCII character in byte constant error: non-ASCII character in byte literal
--> $DIR/multibyte-escapes.rs:9:7 --> $DIR/multibyte-escapes.rs:9:7
| |
LL | b'字'; LL | b'字';
| ^^ | ^^
| | | |
| byte constant must be ASCII | must be ASCII
| this multibyte character does not fit into a single byte | this multibyte character does not fit into a single byte
error: non-ASCII character in byte constant error: non-ASCII character in byte string literal
--> $DIR/multibyte-escapes.rs:14:7 --> $DIR/multibyte-escapes.rs:14:7
| |
LL | b"字"; LL | b"字";
| ^^ byte constant must be ASCII | ^^ must be ASCII
| |
help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes
| |