Rollup merge of #107058 - clubby789:eqeq-homoglyph, r=wesleywiser

Recognise double-equals homoglyph

Recognise `⩵` as a homoglyph for `==`.

The first commit switches `char` to `&str`, as all previous homoglyphs corresponded to a single ASCII character, while the second implements the fix.

`@rustbot` label +A-diagnostics +A-parser
This commit is contained in:
Matthias Krüger 2023-01-20 07:16:10 +01:00 committed by GitHub
commit 3693399ffc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 313 additions and 297 deletions

View file

@ -7,329 +7,331 @@ use rustc_errors::{Applicability, Diagnostic};
use rustc_span::{symbol::kw, BytePos, Pos, Span}; use rustc_span::{symbol::kw, BytePos, Pos, Span};
#[rustfmt::skip] // for line breaks #[rustfmt::skip] // for line breaks
pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[ pub(crate) const UNICODE_ARRAY: &[(char, &str, &str)] = &[
('', "Line Separator", ' '), ('', "Line Separator", " "),
('', "Paragraph Separator", ' '), ('', "Paragraph Separator", " "),
('', "Ogham Space mark", ' '), ('', "Ogham Space mark", " "),
(' ', "En Quad", ' '), (' ', "En Quad", " "),
('', "Em Quad", ' '), ('', "Em Quad", " "),
('', "En Space", ' '), ('', "En Space", " "),
('', "Em Space", ' '), ('', "Em Space", " "),
('', "Three-Per-Em Space", ' '), ('', "Three-Per-Em Space", " "),
('', "Four-Per-Em Space", ' '), ('', "Four-Per-Em Space", " "),
('', "Six-Per-Em Space", ' '), ('', "Six-Per-Em Space", " "),
('', "Punctuation Space", ' '), ('', "Punctuation Space", " "),
('', "Thin Space", ' '), ('', "Thin Space", " "),
('', "Hair Space", ' '), ('', "Hair Space", " "),
('', "Medium Mathematical Space", ' '), ('', "Medium Mathematical Space", " "),
(' ', "No-Break Space", ' '), (' ', "No-Break Space", " "),
('', "Figure Space", ' '), ('', "Figure Space", " "),
('', "Narrow No-Break Space", ' '), ('', "Narrow No-Break Space", " "),
(' ', "Ideographic Space", ' '), (' ', "Ideographic Space", " "),
('ߺ', "Nko Lajanyalan", '_'), ('ߺ', "Nko Lajanyalan", "_"),
('', "Dashed Low Line", '_'), ('', "Dashed Low Line", "_"),
('', "Centreline Low Line", '_'), ('', "Centreline Low Line", "_"),
('', "Wavy Low Line", '_'), ('', "Wavy Low Line", "_"),
('_', "Fullwidth Low Line", '_'), ('_', "Fullwidth Low Line", "_"),
('', "Hyphen", '-'), ('', "Hyphen", "-"),
('', "Non-Breaking Hyphen", '-'), ('', "Non-Breaking Hyphen", "-"),
('', "Figure Dash", '-'), ('', "Figure Dash", "-"),
('', "En Dash", '-'), ('', "En Dash", "-"),
('—', "Em Dash", '-'), ('—', "Em Dash", "-"),
('', "Small Em Dash", '-'), ('', "Small Em Dash", "-"),
('۔', "Arabic Full Stop", '-'), ('۔', "Arabic Full Stop", "-"),
('', "Hyphen Bullet", '-'), ('', "Hyphen Bullet", "-"),
('˗', "Modifier Letter Minus Sign", '-'), ('˗', "Modifier Letter Minus Sign", "-"),
('', "Minus Sign", '-'), ('', "Minus Sign", "-"),
('', "Heavy Minus Sign", '-'), ('', "Heavy Minus Sign", "-"),
('', "Coptic Letter Dialect-P Ni", '-'), ('', "Coptic Letter Dialect-P Ni", "-"),
('ー', "Katakana-Hiragana Prolonged Sound Mark", '-'), ('ー', "Katakana-Hiragana Prolonged Sound Mark", "-"),
('', "Fullwidth Hyphen-Minus", '-'), ('', "Fullwidth Hyphen-Minus", "-"),
('―', "Horizontal Bar", '-'), ('―', "Horizontal Bar", "-"),
('─', "Box Drawings Light Horizontal", '-'), ('─', "Box Drawings Light Horizontal", "-"),
('━', "Box Drawings Heavy Horizontal", '-'), ('━', "Box Drawings Heavy Horizontal", "-"),
('㇐', "CJK Stroke H", '-'), ('㇐', "CJK Stroke H", "-"),
('ꟷ', "Latin Epigraphic Letter Sideways I", '-'), ('ꟷ', "Latin Epigraphic Letter Sideways I", "-"),
('ᅳ', "Hangul Jungseong Eu", '-'), ('ᅳ', "Hangul Jungseong Eu", "-"),
('ㅡ', "Hangul Letter Eu", '-'), ('ㅡ', "Hangul Letter Eu", "-"),
('一', "CJK Unified Ideograph-4E00", '-'), ('一', "CJK Unified Ideograph-4E00", "-"),
('⼀', "Kangxi Radical One", '-'), ('⼀', "Kangxi Radical One", "-"),
('؍', "Arabic Date Separator", ','), ('؍', "Arabic Date Separator", ","),
('٫', "Arabic Decimal Separator", ','), ('٫', "Arabic Decimal Separator", ","),
('', "Single Low-9 Quotation Mark", ','), ('', "Single Low-9 Quotation Mark", ","),
('¸', "Cedilla", ','), ('¸', "Cedilla", ","),
('', "Lisu Letter Tone Na Po", ','), ('', "Lisu Letter Tone Na Po", ","),
('', "Fullwidth Comma", ','), ('', "Fullwidth Comma", ","),
(';', "Greek Question Mark", ';'), (';', "Greek Question Mark", ";"),
('', "Fullwidth Semicolon", ';'), ('', "Fullwidth Semicolon", ";"),
('︔', "Presentation Form For Vertical Semicolon", ';'), ('︔', "Presentation Form For Vertical Semicolon", ";"),
('', "Devanagari Sign Visarga", ':'), ('', "Devanagari Sign Visarga", ":"),
('', "Gujarati Sign Visarga", ':'), ('', "Gujarati Sign Visarga", ":"),
('', "Fullwidth Colon", ':'), ('', "Fullwidth Colon", ":"),
('։', "Armenian Full Stop", ':'), ('։', "Armenian Full Stop", ":"),
('܃', "Syriac Supralinear Colon", ':'), ('܃', "Syriac Supralinear Colon", ":"),
('܄', "Syriac Sublinear Colon", ':'), ('܄', "Syriac Sublinear Colon", ":"),
('', "Runic Multiple Punctuation", ':'), ('', "Runic Multiple Punctuation", ":"),
('', "Presentation Form For Vertical Two Dot Leader", ':'), ('', "Presentation Form For Vertical Two Dot Leader", ":"),
('', "Mongolian Full Stop", ':'), ('', "Mongolian Full Stop", ":"),
('', "Mongolian Manchu Full Stop", ':'), ('', "Mongolian Manchu Full Stop", ":"),
('', "Two Dot Punctuation", ':'), ('', "Two Dot Punctuation", ":"),
('׃', "Hebrew Punctuation Sof Pasuq", ':'), ('׃', "Hebrew Punctuation Sof Pasuq", ":"),
('˸', "Modifier Letter Raised Colon", ':'), ('˸', "Modifier Letter Raised Colon", ":"),
('', "Modifier Letter Colon", ':'), ('', "Modifier Letter Colon", ":"),
('', "Ratio", ':'), ('', "Ratio", ":"),
('ː', "Modifier Letter Triangular Colon", ':'), ('ː', "Modifier Letter Triangular Colon", ":"),
('', "Lisu Letter Tone Mya Jeu", ':'), ('', "Lisu Letter Tone Mya Jeu", ":"),
('︓', "Presentation Form For Vertical Colon", ':'), ('︓', "Presentation Form For Vertical Colon", ":"),
('', "Fullwidth Exclamation Mark", '!'), ('', "Fullwidth Exclamation Mark", "!"),
('ǃ', "Latin Letter Retroflex Click", '!'), ('ǃ', "Latin Letter Retroflex Click", "!"),
('', "Tifinagh Letter Tuareg Yang", '!'), ('', "Tifinagh Letter Tuareg Yang", "!"),
('︕', "Presentation Form For Vertical Exclamation Mark", '!'), ('︕', "Presentation Form For Vertical Exclamation Mark", "!"),
('ʔ', "Latin Letter Glottal Stop", '?'), ('ʔ', "Latin Letter Glottal Stop", "?"),
('Ɂ', "Latin Capital Letter Glottal Stop", '?'), ('Ɂ', "Latin Capital Letter Glottal Stop", "?"),
('', "Devanagari Letter Glottal Stop", '?'), ('', "Devanagari Letter Glottal Stop", "?"),
('', "Cherokee Letter He", '?'), ('', "Cherokee Letter He", "?"),
('', "Bamum Letter Ntuu", '?'), ('', "Bamum Letter Ntuu", "?"),
('', "Fullwidth Question Mark", '?'), ('', "Fullwidth Question Mark", "?"),
('︖', "Presentation Form For Vertical Question Mark", '?'), ('︖', "Presentation Form For Vertical Question Mark", "?"),
('𝅭', "Musical Symbol Combining Augmentation Dot", '.'), ('𝅭', "Musical Symbol Combining Augmentation Dot", "."),
('', "One Dot Leader", '.'), ('', "One Dot Leader", "."),
('܁', "Syriac Supralinear Full Stop", '.'), ('܁', "Syriac Supralinear Full Stop", "."),
('܂', "Syriac Sublinear Full Stop", '.'), ('܂', "Syriac Sublinear Full Stop", "."),
('', "Vai Full Stop", '.'), ('', "Vai Full Stop", "."),
('𐩐', "Kharoshthi Punctuation Dot", '.'), ('𐩐', "Kharoshthi Punctuation Dot", "."),
('٠', "Arabic-Indic Digit Zero", '.'), ('٠', "Arabic-Indic Digit Zero", "."),
('۰', "Extended Arabic-Indic Digit Zero", '.'), ('۰', "Extended Arabic-Indic Digit Zero", "."),
('', "Lisu Letter Tone Mya Ti", '.'), ('', "Lisu Letter Tone Mya Ti", "."),
('·', "Middle Dot", '.'), ('·', "Middle Dot", "."),
('・', "Katakana Middle Dot", '.'), ('・', "Katakana Middle Dot", "."),
('・', "Halfwidth Katakana Middle Dot", '.'), ('・', "Halfwidth Katakana Middle Dot", "."),
('᛫', "Runic Single Punctuation", '.'), ('᛫', "Runic Single Punctuation", "."),
('·', "Greek Ano Teleia", '.'), ('·', "Greek Ano Teleia", "."),
('⸱', "Word Separator Middle Dot", '.'), ('⸱', "Word Separator Middle Dot", "."),
('𐄁', "Aegean Word Separator Dot", '.'), ('𐄁', "Aegean Word Separator Dot", "."),
('•', "Bullet", '.'), ('•', "Bullet", "."),
('‧', "Hyphenation Point", '.'), ('‧', "Hyphenation Point", "."),
('∙', "Bullet Operator", '.'), ('∙', "Bullet Operator", "."),
('⋅', "Dot Operator", '.'), ('⋅', "Dot Operator", "."),
('ꞏ', "Latin Letter Sinological Dot", '.'), ('ꞏ', "Latin Letter Sinological Dot", "."),
('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), ('ᐧ', "Canadian Syllabics Final Middle Dot", "."),
('ᐧ', "Canadian Syllabics Final Middle Dot", '.'), ('ᐧ', "Canadian Syllabics Final Middle Dot", "."),
('', "Fullwidth Full Stop", '.'), ('', "Fullwidth Full Stop", "."),
('。', "Ideographic Full Stop", '.'), ('。', "Ideographic Full Stop", "."),
('︒', "Presentation Form For Vertical Ideographic Full Stop", '.'), ('︒', "Presentation Form For Vertical Ideographic Full Stop", "."),
('՝', "Armenian Comma", '\''), ('՝', "Armenian Comma", "\'"),
('', "Fullwidth Apostrophe", '\''), ('', "Fullwidth Apostrophe", "\'"),
('', "Left Single Quotation Mark", '\''), ('', "Left Single Quotation Mark", "\'"),
('', "Right Single Quotation Mark", '\''), ('', "Right Single Quotation Mark", "\'"),
('', "Single High-Reversed-9 Quotation Mark", '\''), ('', "Single High-Reversed-9 Quotation Mark", "\'"),
('', "Prime", '\''), ('', "Prime", "\'"),
('', "Reversed Prime", '\''), ('', "Reversed Prime", "\'"),
('՚', "Armenian Apostrophe", '\''), ('՚', "Armenian Apostrophe", "\'"),
('׳', "Hebrew Punctuation Geresh", '\''), ('׳', "Hebrew Punctuation Geresh", "\'"),
('`', "Grave Accent", '\''), ('`', "Grave Accent", "\'"),
('', "Greek Varia", '\''), ('', "Greek Varia", "\'"),
('', "Fullwidth Grave Accent", '\''), ('', "Fullwidth Grave Accent", "\'"),
('´', "Acute Accent", '\''), ('´', "Acute Accent", "\'"),
('΄', "Greek Tonos", '\''), ('΄', "Greek Tonos", "\'"),
('', "Greek Oxia", '\''), ('', "Greek Oxia", "\'"),
('', "Greek Koronis", '\''), ('', "Greek Koronis", "\'"),
('᾿', "Greek Psili", '\''), ('᾿', "Greek Psili", "\'"),
('', "Greek Dasia", '\''), ('', "Greek Dasia", "\'"),
('ʹ', "Modifier Letter Prime", '\''), ('ʹ', "Modifier Letter Prime", "\'"),
('ʹ', "Greek Numeral Sign", '\''), ('ʹ', "Greek Numeral Sign", "\'"),
('ˈ', "Modifier Letter Vertical Line", '\''), ('ˈ', "Modifier Letter Vertical Line", "\'"),
('ˊ', "Modifier Letter Acute Accent", '\''), ('ˊ', "Modifier Letter Acute Accent", "\'"),
('ˋ', "Modifier Letter Grave Accent", '\''), ('ˋ', "Modifier Letter Grave Accent", "\'"),
('˴', "Modifier Letter Middle Grave Accent", '\''), ('˴', "Modifier Letter Middle Grave Accent", "\'"),
('ʻ', "Modifier Letter Turned Comma", '\''), ('ʻ', "Modifier Letter Turned Comma", "\'"),
('ʽ', "Modifier Letter Reversed Comma", '\''), ('ʽ', "Modifier Letter Reversed Comma", "\'"),
('ʼ', "Modifier Letter Apostrophe", '\''), ('ʼ', "Modifier Letter Apostrophe", "\'"),
('ʾ', "Modifier Letter Right Half Ring", '\''), ('ʾ', "Modifier Letter Right Half Ring", "\'"),
('', "Latin Small Letter Saltillo", '\''), ('', "Latin Small Letter Saltillo", "\'"),
('י', "Hebrew Letter Yod", '\''), ('י', "Hebrew Letter Yod", "\'"),
('ߴ', "Nko High Tone Apostrophe", '\''), ('ߴ', "Nko High Tone Apostrophe", "\'"),
('ߵ', "Nko Low Tone Apostrophe", '\''), ('ߵ', "Nko Low Tone Apostrophe", "\'"),
('', "Canadian Syllabics West-Cree P", '\''), ('', "Canadian Syllabics West-Cree P", "\'"),
('', "Runic Letter Short-Twig-Sol S", '\''), ('', "Runic Letter Short-Twig-Sol S", "\'"),
('𖽑', "Miao Sign Aspiration", '\''), ('𖽑', "Miao Sign Aspiration", "\'"),
('𖽒', "Miao Sign Reformed Voicing", '\''), ('𖽒', "Miao Sign Reformed Voicing", "\'"),
('᳓', "Vedic Sign Nihshvasa", '"'), ('᳓', "Vedic Sign Nihshvasa", "\""),
('', "Fullwidth Quotation Mark", '"'), ('', "Fullwidth Quotation Mark", "\""),
('“', "Left Double Quotation Mark", '"'), ('“', "Left Double Quotation Mark", "\""),
('”', "Right Double Quotation Mark", '"'), ('”', "Right Double Quotation Mark", "\""),
('‟', "Double High-Reversed-9 Quotation Mark", '"'), ('‟', "Double High-Reversed-9 Quotation Mark", "\""),
('″', "Double Prime", '"'), ('″', "Double Prime", "\""),
('‶', "Reversed Double Prime", '"'), ('‶', "Reversed Double Prime", "\""),
('〃', "Ditto Mark", '"'), ('〃', "Ditto Mark", "\""),
('״', "Hebrew Punctuation Gershayim", '"'), ('״', "Hebrew Punctuation Gershayim", "\""),
('˝', "Double Acute Accent", '"'), ('˝', "Double Acute Accent", "\""),
('ʺ', "Modifier Letter Double Prime", '"'), ('ʺ', "Modifier Letter Double Prime", "\""),
('˶', "Modifier Letter Middle Double Acute Accent", '"'), ('˶', "Modifier Letter Middle Double Acute Accent", "\""),
('˵', "Modifier Letter Middle Double Grave Accent", '"'), ('˵', "Modifier Letter Middle Double Grave Accent", "\""),
('ˮ', "Modifier Letter Double Apostrophe", '"'), ('ˮ', "Modifier Letter Double Apostrophe", "\""),
('ײ', "Hebrew Ligature Yiddish Double Yod", '"'), ('ײ', "Hebrew Ligature Yiddish Double Yod", "\""),
('❞', "Heavy Double Comma Quotation Mark Ornament", '"'), ('❞', "Heavy Double Comma Quotation Mark Ornament", "\""),
('❝', "Heavy Double Turned Comma Quotation Mark Ornament", '"'), ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", "\""),
('', "Fullwidth Left Parenthesis", '('), ('', "Fullwidth Left Parenthesis", "("),
('', "Medium Left Parenthesis Ornament", '('), ('', "Medium Left Parenthesis Ornament", "("),
('', "Ornate Left Parenthesis", '('), ('', "Ornate Left Parenthesis", "("),
('', "Fullwidth Right Parenthesis", ')'), ('', "Fullwidth Right Parenthesis", ")"),
('', "Medium Right Parenthesis Ornament", ')'), ('', "Medium Right Parenthesis Ornament", ")"),
('﴿', "Ornate Right Parenthesis", ')'), ('﴿', "Ornate Right Parenthesis", ")"),
('', "Fullwidth Left Square Bracket", '['), ('', "Fullwidth Left Square Bracket", "["),
('', "Light Left Tortoise Shell Bracket Ornament", '['), ('', "Light Left Tortoise Shell Bracket Ornament", "["),
('「', "Left Corner Bracket", '['), ('「', "Left Corner Bracket", "["),
('『', "Left White Corner Bracket", '['), ('『', "Left White Corner Bracket", "["),
('【', "Left Black Lenticular Bracket", '['), ('【', "Left Black Lenticular Bracket", "["),
('', "Left Tortoise Shell Bracket", '['), ('', "Left Tortoise Shell Bracket", "["),
('〖', "Left White Lenticular Bracket", '['), ('〖', "Left White Lenticular Bracket", "["),
('〘', "Left White Tortoise Shell Bracket", '['), ('〘', "Left White Tortoise Shell Bracket", "["),
('〚', "Left White Square Bracket", '['), ('〚', "Left White Square Bracket", "["),
('', "Fullwidth Right Square Bracket", ']'), ('', "Fullwidth Right Square Bracket", "]"),
('', "Light Right Tortoise Shell Bracket Ornament", ']'), ('', "Light Right Tortoise Shell Bracket Ornament", "]"),
('」', "Right Corner Bracket", ']'), ('」', "Right Corner Bracket", "]"),
('』', "Right White Corner Bracket", ']'), ('』', "Right White Corner Bracket", "]"),
('】', "Right Black Lenticular Bracket", ']'), ('】', "Right Black Lenticular Bracket", "]"),
('', "Right Tortoise Shell Bracket", ']'), ('', "Right Tortoise Shell Bracket", "]"),
('〗', "Right White Lenticular Bracket", ']'), ('〗', "Right White Lenticular Bracket", "]"),
('〙', "Right White Tortoise Shell Bracket", ']'), ('〙', "Right White Tortoise Shell Bracket", "]"),
('〛', "Right White Square Bracket", ']'), ('〛', "Right White Square Bracket", "]"),
('', "Medium Left Curly Bracket Ornament", '{'), ('', "Medium Left Curly Bracket Ornament", "{"),
('𝄔', "Musical Symbol Brace", '{'), ('𝄔', "Musical Symbol Brace", "{"),
('', "Fullwidth Left Curly Bracket", '{'), ('', "Fullwidth Left Curly Bracket", "{"),
('', "Medium Right Curly Bracket Ornament", '}'), ('', "Medium Right Curly Bracket Ornament", "}"),
('', "Fullwidth Right Curly Bracket", '}'), ('', "Fullwidth Right Curly Bracket", "}"),
('', "Low Asterisk", '*'), ('', "Low Asterisk", "*"),
('٭', "Arabic Five Pointed Star", '*'), ('٭', "Arabic Five Pointed Star", "*"),
('', "Asterisk Operator", '*'), ('', "Asterisk Operator", "*"),
('𐌟', "Old Italic Letter Ess", '*'), ('𐌟', "Old Italic Letter Ess", "*"),
('', "Fullwidth Asterisk", '*'), ('', "Fullwidth Asterisk", "*"),
('', "Philippine Single Punctuation", '/'), ('', "Philippine Single Punctuation", "/"),
('', "Caret Insertion Point", '/'), ('', "Caret Insertion Point", "/"),
('', "Division Slash", '/'), ('', "Division Slash", "/"),
('', "Fraction Slash", '/'), ('', "Fraction Slash", "/"),
('', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'), ('', "Box Drawings Light Diagonal Upper Right To Lower Left", "/"),
('', "Mathematical Rising Diagonal", '/'), ('', "Mathematical Rising Diagonal", "/"),
('', "Big Solidus", '/'), ('', "Big Solidus", "/"),
('𝈺', "Greek Instrumental Notation Symbol-47", '/'), ('𝈺', "Greek Instrumental Notation Symbol-47", "/"),
('', "CJK Stroke Sp", '/'), ('', "CJK Stroke Sp", "/"),
('', "Vertical Kana Repeat Mark Upper Half", '/'), ('', "Vertical Kana Repeat Mark Upper Half", "/"),
('', "Coptic Capital Letter Old Coptic Esh", '/'), ('', "Coptic Capital Letter Old Coptic Esh", "/"),
('', "Katakana Letter No", '/'), ('', "Katakana Letter No", "/"),
('丿', "CJK Unified Ideograph-4E3F", '/'), ('丿', "CJK Unified Ideograph-4E3F", "/"),
('', "Kangxi Radical Slash", '/'), ('', "Kangxi Radical Slash", "/"),
('', "Fullwidth Solidus", '/'), ('', "Fullwidth Solidus", "/"),
('', "Fullwidth Reverse Solidus", '\\'), ('', "Fullwidth Reverse Solidus", "\\"),
('', "Small Reverse Solidus", '\\'), ('', "Small Reverse Solidus", "\\"),
('', "Set Minus", '\\'), ('', "Set Minus", "\\"),
('', "Mathematical Falling Diagonal", '\\'), ('', "Mathematical Falling Diagonal", "\\"),
('', "Reverse Solidus Operator", '\\'), ('', "Reverse Solidus Operator", "\\"),
('', "Big Reverse Solidus", '\\'), ('', "Big Reverse Solidus", "\\"),
('', "Greek Vocal Notation Symbol-16", '\\'), ('', "Greek Vocal Notation Symbol-16", "\\"),
('', "Greek Instrumental Symbol-48", '\\'), ('', "Greek Instrumental Symbol-48", "\\"),
('', "CJK Stroke D", '\\'), ('', "CJK Stroke D", "\\"),
('', "CJK Unified Ideograph-4E36", '\\'), ('', "CJK Unified Ideograph-4E36", "\\"),
('', "Kangxi Radical Dot", '\\'), ('', "Kangxi Radical Dot", "\\"),
('、', "Ideographic Comma", '\\'), ('、', "Ideographic Comma", "\\"),
('ヽ', "Katakana Iteration Mark", '\\'), ('ヽ', "Katakana Iteration Mark", "\\"),
('', "Latin Small Letter Um", '&'), ('', "Latin Small Letter Um", "&"),
('', "Fullwidth Ampersand", '&'), ('', "Fullwidth Ampersand", "&"),
('', "Runic Cross Punctuation", '+'), ('', "Runic Cross Punctuation", "+"),
('', "Heavy Plus Sign", '+'), ('', "Heavy Plus Sign", "+"),
('𐊛', "Lycian Letter H", '+'), ('𐊛', "Lycian Letter H", "+"),
('﬩', "Hebrew Letter Alternative Plus Sign", '+'), ('﬩', "Hebrew Letter Alternative Plus Sign", "+"),
('', "Fullwidth Plus Sign", '+'), ('', "Fullwidth Plus Sign", "+"),
('', "Single Left-Pointing Angle Quotation Mark", '<'), ('', "Single Left-Pointing Angle Quotation Mark", "<"),
('', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'), ('', "Heavy Left-Pointing Angle Quotation Mark Ornament", "<"),
('˂', "Modifier Letter Left Arrowhead", '<'), ('˂', "Modifier Letter Left Arrowhead", "<"),
('𝈶', "Greek Instrumental Symbol-40", '<'), ('𝈶', "Greek Instrumental Symbol-40", "<"),
('', "Canadian Syllabics Pa", '<'), ('', "Canadian Syllabics Pa", "<"),
('', "Runic Letter Kauna", '<'), ('', "Runic Letter Kauna", "<"),
('❬', "Medium Left-Pointing Angle Bracket Ornament", '<'), ('❬', "Medium Left-Pointing Angle Bracket Ornament", "<"),
('⟨', "Mathematical Left Angle Bracket", '<'), ('⟨', "Mathematical Left Angle Bracket", "<"),
('〈', "Left-Pointing Angle Bracket", '<'), ('〈', "Left-Pointing Angle Bracket", "<"),
('〈', "Left Angle Bracket", '<'), ('〈', "Left Angle Bracket", "<"),
('㇛', "CJK Stroke Pd", '<'), ('㇛', "CJK Stroke Pd", "<"),
('く', "Hiragana Letter Ku", '<'), ('く', "Hiragana Letter Ku", "<"),
('𡿨', "CJK Unified Ideograph-21FE8", '<'), ('𡿨', "CJK Unified Ideograph-21FE8", "<"),
('《', "Left Double Angle Bracket", '<'), ('《', "Left Double Angle Bracket", "<"),
('', "Fullwidth Less-Than Sign", '<'), ('', "Fullwidth Less-Than Sign", "<"),
('', "Canadian Syllabics Hyphen", '='), ('', "Canadian Syllabics Hyphen", "="),
('', "Double Hyphen", '='), ('', "Double Hyphen", "="),
('', "Katakana-Hiragana Double Hyphen", '='), ('', "Katakana-Hiragana Double Hyphen", "="),
('', "Lisu Punctuation Full Stop", '='), ('', "Lisu Punctuation Full Stop", "="),
('', "Fullwidth Equals Sign", '='), ('', "Fullwidth Equals Sign", "="),
('', "Single Right-Pointing Angle Quotation Mark", '>'), ('', "Single Right-Pointing Angle Quotation Mark", ">"),
('', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'), ('', "Heavy Right-Pointing Angle Quotation Mark Ornament", ">"),
('˃', "Modifier Letter Right Arrowhead", '>'), ('˃', "Modifier Letter Right Arrowhead", ">"),
('𝈷', "Greek Instrumental Symbol-42", '>'), ('𝈷', "Greek Instrumental Symbol-42", ">"),
('', "Canadian Syllabics Po", '>'), ('', "Canadian Syllabics Po", ">"),
('𖼿', "Miao Letter Archaic Zza", '>'), ('𖼿', "Miao Letter Archaic Zza", ">"),
('❭', "Medium Right-Pointing Angle Bracket Ornament", '>'), ('❭', "Medium Right-Pointing Angle Bracket Ornament", ">"),
('⟩', "Mathematical Right Angle Bracket", '>'), ('⟩', "Mathematical Right Angle Bracket", ">"),
('〉', "Right-Pointing Angle Bracket", '>'), ('〉', "Right-Pointing Angle Bracket", ">"),
('〉', "Right Angle Bracket", '>'), ('〉', "Right Angle Bracket", ">"),
('》', "Right Double Angle Bracket", '>'), ('》', "Right Double Angle Bracket", ">"),
('', "Fullwidth Greater-Than Sign", '>'), ('', "Fullwidth Greater-Than Sign", ">"),
('⩵', "Two Consecutive Equals Signs", "==")
]; ];
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, instead of
// keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`. // keeping the substitution token in this table. Ideally, this should be inside `rustc_lexer`.
// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add // However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
// fancier error recovery to it, as there will be less overall work to do this way. // fancier error recovery to it, as there will be less overall work to do this way.
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[ const ASCII_ARRAY: &[(&str, &str, Option<token::TokenKind>)] = &[
(' ', "Space", None), (" ", "Space", None),
('_', "Underscore", Some(token::Ident(kw::Underscore, false))), ("_", "Underscore", Some(token::Ident(kw::Underscore, false))),
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))), ("-", "Minus/Hyphen", Some(token::BinOp(token::Minus))),
(',', "Comma", Some(token::Comma)), (",", "Comma", Some(token::Comma)),
(';', "Semicolon", Some(token::Semi)), (";", "Semicolon", Some(token::Semi)),
(':', "Colon", Some(token::Colon)), (":", "Colon", Some(token::Colon)),
('!', "Exclamation Mark", Some(token::Not)), ("!", "Exclamation Mark", Some(token::Not)),
('?', "Question Mark", Some(token::Question)), ("?", "Question Mark", Some(token::Question)),
('.', "Period", Some(token::Dot)), (".", "Period", Some(token::Dot)),
('(', "Left Parenthesis", Some(token::OpenDelim(Delimiter::Parenthesis))), ("(", "Left Parenthesis", Some(token::OpenDelim(Delimiter::Parenthesis))),
(')', "Right Parenthesis", Some(token::CloseDelim(Delimiter::Parenthesis))), (")", "Right Parenthesis", Some(token::CloseDelim(Delimiter::Parenthesis))),
('[', "Left Square Bracket", Some(token::OpenDelim(Delimiter::Bracket))), ("[", "Left Square Bracket", Some(token::OpenDelim(Delimiter::Bracket))),
(']', "Right Square Bracket", Some(token::CloseDelim(Delimiter::Bracket))), ("]", "Right Square Bracket", Some(token::CloseDelim(Delimiter::Bracket))),
('{', "Left Curly Brace", Some(token::OpenDelim(Delimiter::Brace))), ("{", "Left Curly Brace", Some(token::OpenDelim(Delimiter::Brace))),
('}', "Right Curly Brace", Some(token::CloseDelim(Delimiter::Brace))), ("}", "Right Curly Brace", Some(token::CloseDelim(Delimiter::Brace))),
('*', "Asterisk", Some(token::BinOp(token::Star))), ("*", "Asterisk", Some(token::BinOp(token::Star))),
('/', "Slash", Some(token::BinOp(token::Slash))), ("/", "Slash", Some(token::BinOp(token::Slash))),
('\\', "Backslash", None), ("\\", "Backslash", None),
('&', "Ampersand", Some(token::BinOp(token::And))), ("&", "Ampersand", Some(token::BinOp(token::And))),
('+', "Plus Sign", Some(token::BinOp(token::Plus))), ("+", "Plus Sign", Some(token::BinOp(token::Plus))),
('<', "Less-Than Sign", Some(token::Lt)), ("<", "Less-Than Sign", Some(token::Lt)),
('=', "Equals Sign", Some(token::Eq)), ("=", "Equals Sign", Some(token::Eq)),
('>', "Greater-Than Sign", Some(token::Gt)), ("==", "Double Equals Sign", Some(token::EqEq)),
(">", "Greater-Than Sign", Some(token::Gt)),
// FIXME: Literals are already lexed by this point, so we can't recover gracefully just by // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
// spitting the correct token out. // spitting the correct token out.
('\'', "Single Quote", None), ("\'", "Single Quote", None),
('"', "Quotation Mark", None), ("\"", "Quotation Mark", None),
]; ];
pub(super) fn check_for_substitution<'a>( pub(super) fn check_for_substitution<'a>(
@ -339,11 +341,11 @@ pub(super) fn check_for_substitution<'a>(
err: &mut Diagnostic, err: &mut Diagnostic,
count: usize, count: usize,
) -> Option<token::TokenKind> { ) -> Option<token::TokenKind> {
let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?; let &(_, u_name, ascii_str) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count)); let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));
let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else { let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else {
let msg = format!("substitution character not found for '{}'", ch); let msg = format!("substitution character not found for '{}'", ch);
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg); reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
return None; return None;
@ -354,7 +356,7 @@ pub(super) fn check_for_substitution<'a>(
let msg = format!( let msg = format!(
"Unicode characters '“' (Left Double Quotation Mark) and \ "Unicode characters '“' (Left Double Quotation Mark) and \
'”' (Right Double Quotation Mark) look like '{}' ({}), but are not", '”' (Right Double Quotation Mark) look like '{}' ({}), but are not",
ascii_char, ascii_name ascii_str, ascii_name
); );
err.span_suggestion( err.span_suggestion(
Span::with_root_ctxt( Span::with_root_ctxt(
@ -368,12 +370,12 @@ pub(super) fn check_for_substitution<'a>(
} else { } else {
let msg = format!( let msg = format!(
"Unicode character '{}' ({}) looks like '{}' ({}), but it is not", "Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
ch, u_name, ascii_char, ascii_name ch, u_name, ascii_str, ascii_name
); );
err.span_suggestion( err.span_suggestion(
span, span,
&msg, &msg,
ascii_char.to_string().repeat(count), ascii_str.to_string().repeat(count),
Applicability::MaybeIncorrect, Applicability::MaybeIncorrect,
); );
} }

View file

@ -6,4 +6,7 @@ fn main() {
//~^ ERROR unknown start of token: \u{a0} //~^ ERROR unknown start of token: \u{a0}
//~^^ NOTE character appears 3 more times //~^^ NOTE character appears 3 more times
//~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not //~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
let _ = 1 ⩵ 2;
//~^ ERROR unknown start of token
//~^^ HELP Unicode character '⩵' (Two Consecutive Equals Signs) looks like '==' (Double Equals Sign), but it is not
} }

View file

@ -21,5 +21,16 @@ help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is
LL | let x = 0; LL | let x = 0;
| ++++ | ++++
error: aborting due to 2 previous errors error: unknown start of token: \u{2a75}
--> $DIR/unicode-chars.rs:9:15
|
LL | let _ = 1 ⩵ 2;
| ^
|
help: Unicode character '⩵' (Two Consecutive Equals Signs) looks like '==' (Double Equals Sign), but it is not
|
LL | let _ = 1 == 2;
| ~~
error: aborting due to 3 previous errors