Rollup merge of #119033 - Zalathar:unicode, r=davidtwco

coverage: `llvm-cov` expects column numbers to be bytes, not code points

Normally the compiler emits column numbers as a 1-based number of Unicode code points.

But when we embed coverage mappings for `-Cinstrument-coverage`, those mappings will ultimately be read by the `llvm-cov` tool. That tool assumes that column numbers are 1-based numbers of *bytes*, and relies on that assumption when slicing up source code to apply highlighting (in HTML reports, and in text-based reports with colour).

For the very common case of all-ASCII source code, bytes and code points are the same, so the difference isn't noticeable. But for code that contains non-ASCII characters, emitting column numbers as code points will result in `llvm-cov` slicing strings in the wrong places, producing mangled output or fatal errors. (See https://github.com/taiki-e/cargo-llvm-cov/issues/275 as an example of what can go wrong.)
2024-01-09 00:19:33 +01:00 · 2024-01-09 00:19:33 +01:00 · 70e3f8d240
commit 70e3f8d240
parent ee7d4c1561 6971e9332d
5 changed files with 201 additions and 21 deletions
--- a/compiler/rustc_mir_transform/src/coverage/mod.rs
+++ b/compiler/rustc_mir_transform/src/coverage/mod.rs
@ -23,7 +23,7 @@ use rustc_middle::mir::{
 use rustc_middle::ty::TyCtxt;
 use rustc_span::def_id::LocalDefId;
 use rustc_span::source_map::SourceMap;
-use rustc_span::{Span, Symbol};
+use rustc_span::{BytePos, Pos, RelativeBytePos, Span, Symbol};
 /// Inserts `StatementKind::Coverage` statements that either instrument the binary with injected
 /// counters, via intrinsic `llvm.instrprof.increment`, and/or inject metadata used during codegen
@ -107,6 +107,12 @@ impl<'a, 'tcx> Instrumentor<'a, 'tcx> {
        );
        let mappings = self.create_mappings(&coverage_spans, &coverage_counters);
        if mappings.is_empty() {
            // No spans could be converted into valid mappings, so skip this function.
            debug!("no spans could be converted into valid mappings; skipping");
            return;
        }
        self.inject_coverage_statements(bcb_has_coverage_spans, &coverage_counters);
        self.mir_body.function_coverage_info = Some(Box::new(FunctionCoverageInfo {
@ -148,9 +154,9 @@ impl<'a, 'tcx> Instrumentor<'a, 'tcx> {
            // Flatten the spans into individual term/span pairs.
            .flat_map(|(term, spans)| spans.iter().map(move |&span| (term, span)))
            // Convert each span to a code region, and create the final mapping.
-            .map(|(term, span)| {
+            .filter_map(|(term, span)| {
-                let code_region = make_code_region(source_map, file_name, span, body_span);
+                let code_region = make_code_region(source_map, file_name, span, body_span)?;
-                Mapping { term, code_region }
+                Some(Mapping { term, code_region })
            })
            .collect::<Vec<_>>()
    }
@ -252,13 +258,22 @@ fn inject_statement(mir_body: &mut mir::Body<'_>, counter_kind: CoverageKind, bb
    data.statements.insert(0, statement);
 }
-/// Convert the Span into its file name, start line and column, and end line and column
+/// Convert the Span into its file name, start line and column, and end line and column.
 ///
 /// Line numbers and column numbers are 1-based. Unlike most column numbers emitted by
 /// the compiler, these column numbers are denoted in **bytes**, because that's what
 /// LLVM's `llvm-cov` tool expects to see in coverage maps.
 ///
 /// Returns `None` if the conversion failed for some reason. This shouldn't happen,
 /// but it's hard to rule out entirely (especially in the presence of complex macros
 /// or other expansions), and if it does happen then skipping a span or function is
 /// better than an ICE or `llvm-cov` failure that the user might have no way to avoid.
 fn make_code_region(
    source_map: &SourceMap,
    file_name: Symbol,
    span: Span,
    body_span: Span,
-) -> CodeRegion {
+) -> Option<CodeRegion> {
    debug!(
        "Called make_code_region(file_name={}, span={}, body_span={})",
        file_name,
@ -266,27 +281,62 @@ fn make_code_region(
        source_map.span_to_diagnostic_string(body_span)
    );
-    let (file, mut start_line, mut start_col, mut end_line, mut end_col) =
+    let lo = span.lo();
-        source_map.span_to_location_info(span);
+    let hi = span.hi();
-    if span.hi() == span.lo() {
+
-        // Extend an empty span by one character so the region will be counted.
+    let file = source_map.lookup_source_file(lo);
-        if span.hi() == body_span.hi() {
+    if !file.contains(hi) {
-            start_col = start_col.saturating_sub(1);
+        debug!(?span, ?file, ?lo, ?hi, "span crosses multiple files; skipping");
-        } else {
+        return None;
            end_col = start_col + 1;
        }
    };
    if let Some(file) = file {
        start_line = source_map.doctest_offset_line(&file.name, start_line);
        end_line = source_map.doctest_offset_line(&file.name, end_line);
    }
-    CodeRegion {
+
    // Column numbers need to be in bytes, so we can't use the more convenient
    // `SourceMap` methods for looking up file coordinates.
    let rpos_and_line_and_byte_column = |pos: BytePos| -> Option<(RelativeBytePos, usize, usize)> {
        let rpos = file.relative_position(pos);
        let line_index = file.lookup_line(rpos)?;
        let line_start = file.lines()[line_index];
        // Line numbers and column numbers are 1-based, so add 1 to each.
        Some((rpos, line_index + 1, (rpos - line_start).to_usize() + 1))
    };
    let (lo_rpos, mut start_line, mut start_col) = rpos_and_line_and_byte_column(lo)?;
    let (hi_rpos, mut end_line, mut end_col) = rpos_and_line_and_byte_column(hi)?;
    // If the span is empty, try to expand it horizontally by one character's
    // worth of bytes, so that it is more visible in `llvm-cov` reports.
    // We do this after resolving line/column numbers, so that empty spans at the
    // end of a line get an extra column instead of wrapping to the next line.
    if span.is_empty()
        && body_span.contains(span)
        && let Some(src) = &file.src
    {
        // Prefer to expand the end position, if it won't go outside the body span.
        if hi < body_span.hi() {
            let hi_rpos = hi_rpos.to_usize();
            let nudge_bytes = src.ceil_char_boundary(hi_rpos + 1) - hi_rpos;
            end_col += nudge_bytes;
        } else if lo > body_span.lo() {
            let lo_rpos = lo_rpos.to_usize();
            let nudge_bytes = lo_rpos - src.floor_char_boundary(lo_rpos - 1);
            // Subtract the nudge, but don't go below column 1.
            start_col = start_col.saturating_sub(nudge_bytes).max(1);
        }
        // If neither nudge could be applied, stick with the empty span coordinates.
    }
    // Apply an offset so that code in doctests has correct line numbers.
    // FIXME(#79417): Currently we have no way to offset doctest _columns_.
    start_line = source_map.doctest_offset_line(&file.name, start_line);
    end_line = source_map.doctest_offset_line(&file.name, end_line);
    Some(CodeRegion {
        file_name,
        start_line: start_line as u32,
        start_col: start_col as u32,
        end_line: end_line as u32,
        end_col: end_col as u32,
-    }
+    })
 }
 fn is_eligible_for_coverage(tcx: TyCtxt<'_>, def_id: LocalDefId) -> bool {
--- a/compiler/rustc_mir_transform/src/lib.rs
+++ b/compiler/rustc_mir_transform/src/lib.rs
@ -9,6 +9,7 @@
 #![feature(min_specialization)]
 #![feature(never_type)]
 #![feature(option_get_or_insert_default)]
 #![feature(round_char_boundary)]
 #![feature(trusted_step)]
 #![feature(try_blocks)]
 #![feature(yeet_expr)]
--- a/tests/coverage/unicode.cov-map
+++ b/tests/coverage/unicode.cov-map
@ -0,0 +1,53 @@
 Function name: unicode::main
 Raw bytes (67): 0x[01, 01, 09, 01, 05, 03, 05, 1e, 0d, 22, 09, 03, 05, 11, 1b, 1e, 0d, 22, 09, 03, 05, 09, 01, 0e, 01, 00, 0b, 05, 01, 09, 00, 0c, 03, 00, 10, 00, 1b, 05, 00, 1c, 00, 28, 22, 02, 08, 00, 25, 09, 00, 29, 00, 46, 11, 00, 47, 02, 06, 1b, 02, 06, 00, 07, 17, 02, 05, 01, 02]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 9
 - expression 0 operands: lhs = Counter(0), rhs = Counter(1)
 - expression 1 operands: lhs = Expression(0, Add), rhs = Counter(1)
 - expression 2 operands: lhs = Expression(7, Sub), rhs = Counter(3)
 - expression 3 operands: lhs = Expression(8, Sub), rhs = Counter(2)
 - expression 4 operands: lhs = Expression(0, Add), rhs = Counter(1)
 - expression 5 operands: lhs = Counter(4), rhs = Expression(6, Add)
 - expression 6 operands: lhs = Expression(7, Sub), rhs = Counter(3)
 - expression 7 operands: lhs = Expression(8, Sub), rhs = Counter(2)
 - expression 8 operands: lhs = Expression(0, Add), rhs = Counter(1)
 Number of file 0 mappings: 9
 - Code(Counter(0)) at (prev + 14, 1) to (start + 0, 11)
 - Code(Counter(1)) at (prev + 1, 9) to (start + 0, 12)
 - Code(Expression(0, Add)) at (prev + 0, 16) to (start + 0, 27)
    = (c0 + c1)
 - Code(Counter(1)) at (prev + 0, 28) to (start + 0, 40)
 - Code(Expression(8, Sub)) at (prev + 2, 8) to (start + 0, 37)
    = ((c0 + c1) - c1)
 - Code(Counter(2)) at (prev + 0, 41) to (start + 0, 70)
 - Code(Counter(4)) at (prev + 0, 71) to (start + 2, 6)
 - Code(Expression(6, Add)) at (prev + 2, 6) to (start + 0, 7)
    = ((((c0 + c1) - c1) - c2) + c3)
 - Code(Expression(5, Add)) at (prev + 2, 5) to (start + 1, 2)
    = (c4 + ((((c0 + c1) - c1) - c2) + c3))
 Function name: unicode::サビ
 Raw bytes (9): 0x[01, 01, 00, 01, 01, 1e, 14, 00, 18]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 0
 Number of file 0 mappings: 1
 - Code(Counter(0)) at (prev + 30, 20) to (start + 0, 24)
 Function name: unicode::他 (unused)
 Raw bytes (9): 0x[01, 01, 00, 01, 00, 1e, 19, 00, 25]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 0
 Number of file 0 mappings: 1
 - Code(Zero) at (prev + 30, 25) to (start + 0, 37)
 Function name: unicode::申し訳ございません
 Raw bytes (9): 0x[01, 01, 00, 01, 01, 18, 01, 02, 02]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 0
 Number of file 0 mappings: 1
 - Code(Counter(0)) at (prev + 24, 1) to (start + 2, 2)
--- a/tests/coverage/unicode.coverage
+++ b/tests/coverage/unicode.coverage
@ -0,0 +1,40 @@
   LL|       |// edition: 2021
   LL|       |// ignore-windows - we can't force `llvm-cov` to use ANSI escapes on Windows
   LL|       |// llvm-cov-flags: --use-color
   LL|       |
   LL|       |// Check that column numbers are denoted in bytes, so that they don't cause
   LL|       |// `llvm-cov` to fail or emit malformed output.
   LL|       |//
   LL|       |// Note that when `llvm-cov` prints ^ arrows on a subsequent line, it simply
   LL|       |// inserts one space character for each "column", with no understanding of
   LL|       |// Unicode or character widths. So those arrows will tend to be misaligned
   LL|       |// for non-ASCII source code, regardless of whether column numbers are code
   LL|       |// points or bytes.
   LL|       |
   LL|      1|fn main() {
   LL|     [0;35m33[0m|    for _İ in 'А'..='Я' { /* Я */ }
                      ^32                ^32
   LL|       |
   LL|      [0;35m1[0m|    if 申し訳ございません() && [0;41m申し訳ございません()[0m [0;41m{[0m
                                                      ^0
   LL|      0|[0;41m        println!("true");[0m
   LL|      1|[0;41m    }[0m
   LL|       |
   LL|      1|    サビ();
   LL|      1|}
   LL|       |
   LL|      1|fn 申し訳ございません() -> bool {
   LL|      1|    std::hint::black_box(false)
   LL|      1|}
   LL|       |
   LL|       |macro_rules! macro_that_defines_a_function {
   LL|       |    (fn $名:ident () $体:tt) => {
   LL|      [0;35m1[0m|        fn $名 () $体 [0;41mfn 他 () {}[0m
                                      ^0
   LL|       |    }
   LL|       |}
   LL|       |
   LL|       |macro_that_defines_a_function! {
   LL|       |    fn サビ() {}
   LL|       |}
--- a/tests/coverage/unicode.rs
+++ b/tests/coverage/unicode.rs
@ -0,0 +1,36 @@
 // edition: 2021
 // ignore-windows - we can't force `llvm-cov` to use ANSI escapes on Windows
 // llvm-cov-flags: --use-color
 // Check that column numbers are denoted in bytes, so that they don't cause
 // `llvm-cov` to fail or emit malformed output.
 //
 // Note that when `llvm-cov` prints ^ arrows on a subsequent line, it simply
 // inserts one space character for each "column", with no understanding of
 // Unicode or character widths. So those arrows will tend to be misaligned
 // for non-ASCII source code, regardless of whether column numbers are code
 // points or bytes.
 fn main() {
    for _İ in 'А'..='Я' { /* Я */ }
    if 申し訳ございません() && 申し訳ございません() {
        println!("true");
    }
    サビ();
 }
 fn 申し訳ございません() -> bool {
    std::hint::black_box(false)
 }
 macro_rules! macro_that_defines_a_function {
    (fn $名:ident () $体:tt) => {
        fn $名 () $体 fn 他 () {}
    }
 }
 macro_that_defines_a_function! {
    fn サビ() {}
 }