Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit fe2ee80

Browse files
authored Jan 9, 2024
Unrolled build for rust-lang#119033
Rollup merge of rust-lang#119033 - Zalathar:unicode, r=davidtwco

coverage: `llvm-cov` expects column numbers to be bytes, not code points

Normally the compiler emits column numbers as a 1-based number of Unicode code points.

But when we embed coverage mappings for `-Cinstrument-coverage`, those mappings will ultimately be read by the `llvm-cov` tool. That tool assumes that column numbers are 1-based numbers of *bytes*, and relies on that assumption when slicing up source code to apply highlighting (in HTML reports, and in text-based reports with colour).

For the very common case of all-ASCII source code, bytes and code points are the same, so the difference isn't noticeable. But for code that contains non-ASCII characters, emitting column numbers as code points will result in `llvm-cov` slicing strings in the wrong places, producing mangled output or fatal errors. (See taiki-e/cargo-llvm-cov#275 as an example of what can go wrong.)
2 parents ca663b0 + 6971e93 commit fe2ee80

File tree

5 files changed

+200
-20
lines changed

5 files changed

+200
-20
lines changed
 

‎compiler/rustc_mir_transform/src/coverage/mod.rs

+70-20
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use rustc_middle::mir::{
2323
use rustc_middle::ty::TyCtxt;
2424
use rustc_span::def_id::LocalDefId;
2525
use rustc_span::source_map::SourceMap;
26-
use rustc_span::{Span, Symbol};
26+
use rustc_span::{BytePos, Pos, RelativeBytePos, Span, Symbol};
2727

2828
/// Inserts `StatementKind::Coverage` statements that either instrument the binary with injected
2929
/// counters, via intrinsic `llvm.instrprof.increment`, and/or inject metadata used during codegen
@@ -107,6 +107,12 @@ impl<'a, 'tcx> Instrumentor<'a, 'tcx> {
107107
);
108108

109109
let mappings = self.create_mappings(&coverage_spans, &coverage_counters);
110+
if mappings.is_empty() {
111+
// No spans could be converted into valid mappings, so skip this function.
112+
debug!("no spans could be converted into valid mappings; skipping");
113+
return;
114+
}
115+
110116
self.inject_coverage_statements(bcb_has_coverage_spans, &coverage_counters);
111117

112118
self.mir_body.function_coverage_info = Some(Box::new(FunctionCoverageInfo {
@@ -148,9 +154,9 @@ impl<'a, 'tcx> Instrumentor<'a, 'tcx> {
148154
// Flatten the spans into individual term/span pairs.
149155
.flat_map(|(term, spans)| spans.iter().map(move |&span| (term, span)))
150156
// Convert each span to a code region, and create the final mapping.
151-
.map(|(term, span)| {
152-
let code_region = make_code_region(source_map, file_name, span, body_span);
153-
Mapping { term, code_region }
157+
.filter_map(|(term, span)| {
158+
let code_region = make_code_region(source_map, file_name, span, body_span)?;
159+
Some(Mapping { term, code_region })
154160
})
155161
.collect::<Vec<_>>()
156162
}
@@ -252,41 +258,85 @@ fn inject_statement(mir_body: &mut mir::Body<'_>, counter_kind: CoverageKind, bb
252258
data.statements.insert(0, statement);
253259
}
254260

255-
/// Convert the Span into its file name, start line and column, and end line and column
261+
/// Convert the Span into its file name, start line and column, and end line and column.
262+
///
263+
/// Line numbers and column numbers are 1-based. Unlike most column numbers emitted by
264+
/// the compiler, these column numbers are denoted in **bytes**, because that's what
265+
/// LLVM's `llvm-cov` tool expects to see in coverage maps.
266+
///
267+
/// Returns `None` if the conversion failed for some reason. This shouldn't happen,
268+
/// but it's hard to rule out entirely (especially in the presence of complex macros
269+
/// or other expansions), and if it does happen then skipping a span or function is
270+
/// better than an ICE or `llvm-cov` failure that the user might have no way to avoid.
256271
fn make_code_region(
257272
source_map: &SourceMap,
258273
file_name: Symbol,
259274
span: Span,
260275
body_span: Span,
261-
) -> CodeRegion {
276+
) -> Option<CodeRegion> {
262277
debug!(
263278
"Called make_code_region(file_name={}, span={}, body_span={})",
264279
file_name,
265280
source_map.span_to_diagnostic_string(span),
266281
source_map.span_to_diagnostic_string(body_span)
267282
);
268283

269-
let (file, mut start_line, mut start_col, mut end_line, mut end_col) =
270-
source_map.span_to_location_info(span);
271-
if span.hi() == span.lo() {
272-
// Extend an empty span by one character so the region will be counted.
273-
if span.hi() == body_span.hi() {
274-
start_col = start_col.saturating_sub(1);
275-
} else {
276-
end_col = start_col + 1;
277-
}
284+
let lo = span.lo();
285+
let hi = span.hi();
286+
287+
let file = source_map.lookup_source_file(lo);
288+
if !file.contains(hi) {
289+
debug!(?span, ?file, ?lo, ?hi, "span crosses multiple files; skipping");
290+
return None;
291+
}
292+
293+
// Column numbers need to be in bytes, so we can't use the more convenient
294+
// `SourceMap` methods for looking up file coordinates.
295+
let rpos_and_line_and_byte_column = |pos: BytePos| -> Option<(RelativeBytePos, usize, usize)> {
296+
let rpos = file.relative_position(pos);
297+
let line_index = file.lookup_line(rpos)?;
298+
let line_start = file.lines()[line_index];
299+
// Line numbers and column numbers are 1-based, so add 1 to each.
300+
Some((rpos, line_index + 1, (rpos - line_start).to_usize() + 1))
278301
};
279-
if let Some(file) = file {
280-
start_line = source_map.doctest_offset_line(&file.name, start_line);
281-
end_line = source_map.doctest_offset_line(&file.name, end_line);
302+
303+
let (lo_rpos, mut start_line, mut start_col) = rpos_and_line_and_byte_column(lo)?;
304+
let (hi_rpos, mut end_line, mut end_col) = rpos_and_line_and_byte_column(hi)?;
305+
306+
// If the span is empty, try to expand it horizontally by one character's
307+
// worth of bytes, so that it is more visible in `llvm-cov` reports.
308+
// We do this after resolving line/column numbers, so that empty spans at the
309+
// end of a line get an extra column instead of wrapping to the next line.
310+
if span.is_empty()
311+
&& body_span.contains(span)
312+
&& let Some(src) = &file.src
313+
{
314+
// Prefer to expand the end position, if it won't go outside the body span.
315+
if hi < body_span.hi() {
316+
let hi_rpos = hi_rpos.to_usize();
317+
let nudge_bytes = src.ceil_char_boundary(hi_rpos + 1) - hi_rpos;
318+
end_col += nudge_bytes;
319+
} else if lo > body_span.lo() {
320+
let lo_rpos = lo_rpos.to_usize();
321+
let nudge_bytes = lo_rpos - src.floor_char_boundary(lo_rpos - 1);
322+
// Subtract the nudge, but don't go below column 1.
323+
start_col = start_col.saturating_sub(nudge_bytes).max(1);
324+
}
325+
// If neither nudge could be applied, stick with the empty span coordinates.
282326
}
283-
CodeRegion {
327+
328+
// Apply an offset so that code in doctests has correct line numbers.
329+
// FIXME(#79417): Currently we have no way to offset doctest _columns_.
330+
start_line = source_map.doctest_offset_line(&file.name, start_line);
331+
end_line = source_map.doctest_offset_line(&file.name, end_line);
332+
333+
Some(CodeRegion {
284334
file_name,
285335
start_line: start_line as u32,
286336
start_col: start_col as u32,
287337
end_line: end_line as u32,
288338
end_col: end_col as u32,
289-
}
339+
})
290340
}
291341

292342
fn is_eligible_for_coverage(tcx: TyCtxt<'_>, def_id: LocalDefId) -> bool {

‎compiler/rustc_mir_transform/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#![feature(min_specialization)]
1010
#![feature(never_type)]
1111
#![feature(option_get_or_insert_default)]
12+
#![feature(round_char_boundary)]
1213
#![feature(trusted_step)]
1314
#![feature(try_blocks)]
1415
#![feature(yeet_expr)]

‎tests/coverage/unicode.cov-map

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
Function name: unicode::main
2+
Raw bytes (67): 0x[01, 01, 09, 01, 05, 03, 05, 1e, 0d, 22, 09, 03, 05, 11, 1b, 1e, 0d, 22, 09, 03, 05, 09, 01, 0e, 01, 00, 0b, 05, 01, 09, 00, 0c, 03, 00, 10, 00, 1b, 05, 00, 1c, 00, 28, 22, 02, 08, 00, 25, 09, 00, 29, 00, 46, 11, 00, 47, 02, 06, 1b, 02, 06, 00, 07, 17, 02, 05, 01, 02]
3+
Number of files: 1
4+
- file 0 => global file 1
5+
Number of expressions: 9
6+
- expression 0 operands: lhs = Counter(0), rhs = Counter(1)
7+
- expression 1 operands: lhs = Expression(0, Add), rhs = Counter(1)
8+
- expression 2 operands: lhs = Expression(7, Sub), rhs = Counter(3)
9+
- expression 3 operands: lhs = Expression(8, Sub), rhs = Counter(2)
10+
- expression 4 operands: lhs = Expression(0, Add), rhs = Counter(1)
11+
- expression 5 operands: lhs = Counter(4), rhs = Expression(6, Add)
12+
- expression 6 operands: lhs = Expression(7, Sub), rhs = Counter(3)
13+
- expression 7 operands: lhs = Expression(8, Sub), rhs = Counter(2)
14+
- expression 8 operands: lhs = Expression(0, Add), rhs = Counter(1)
15+
Number of file 0 mappings: 9
16+
- Code(Counter(0)) at (prev + 14, 1) to (start + 0, 11)
17+
- Code(Counter(1)) at (prev + 1, 9) to (start + 0, 12)
18+
- Code(Expression(0, Add)) at (prev + 0, 16) to (start + 0, 27)
19+
= (c0 + c1)
20+
- Code(Counter(1)) at (prev + 0, 28) to (start + 0, 40)
21+
- Code(Expression(8, Sub)) at (prev + 2, 8) to (start + 0, 37)
22+
= ((c0 + c1) - c1)
23+
- Code(Counter(2)) at (prev + 0, 41) to (start + 0, 70)
24+
- Code(Counter(4)) at (prev + 0, 71) to (start + 2, 6)
25+
- Code(Expression(6, Add)) at (prev + 2, 6) to (start + 0, 7)
26+
= ((((c0 + c1) - c1) - c2) + c3)
27+
- Code(Expression(5, Add)) at (prev + 2, 5) to (start + 1, 2)
28+
= (c4 + ((((c0 + c1) - c1) - c2) + c3))
29+
30+
Function name: unicode::サビ
31+
Raw bytes (9): 0x[01, 01, 00, 01, 01, 1e, 14, 00, 18]
32+
Number of files: 1
33+
- file 0 => global file 1
34+
Number of expressions: 0
35+
Number of file 0 mappings: 1
36+
- Code(Counter(0)) at (prev + 30, 20) to (start + 0, 24)
37+
38+
Function name: unicode::他 (unused)
39+
Raw bytes (9): 0x[01, 01, 00, 01, 00, 1e, 19, 00, 25]
40+
Number of files: 1
41+
- file 0 => global file 1
42+
Number of expressions: 0
43+
Number of file 0 mappings: 1
44+
- Code(Zero) at (prev + 30, 25) to (start + 0, 37)
45+
46+
Function name: unicode::申し訳ございません
47+
Raw bytes (9): 0x[01, 01, 00, 01, 01, 18, 01, 02, 02]
48+
Number of files: 1
49+
- file 0 => global file 1
50+
Number of expressions: 0
51+
Number of file 0 mappings: 1
52+
- Code(Counter(0)) at (prev + 24, 1) to (start + 2, 2)
53+

‎tests/coverage/unicode.coverage

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
LL| |// edition: 2021
2+
LL| |// ignore-windows - we can't force `llvm-cov` to use ANSI escapes on Windows
3+
LL| |// llvm-cov-flags: --use-color
4+
LL| |
5+
LL| |// Check that column numbers are denoted in bytes, so that they don't cause
6+
LL| |// `llvm-cov` to fail or emit malformed output.
7+
LL| |//
8+
LL| |// Note that when `llvm-cov` prints ^ arrows on a subsequent line, it simply
9+
LL| |// inserts one space character for each "column", with no understanding of
10+
LL| |// Unicode or character widths. So those arrows will tend to be misaligned
11+
LL| |// for non-ASCII source code, regardless of whether column numbers are code
12+
LL| |// points or bytes.
13+
LL| |
14+
LL| 1|fn main() {
15+
LL| 33| for _İ in 'А'..='Я' { /* Я */ }
16+
^32 ^32
17+
LL| |
18+
LL| 1| if 申し訳ございません() && 申し訳ございません() {
19+
^0
20+
LL| 0| println!("true");
21+
LL| 1| }
22+
LL| |
23+
LL| 1| サビ();
24+
LL| 1|}
25+
LL| |
26+
LL| 1|fn 申し訳ございません() -> bool {
27+
LL| 1| std::hint::black_box(false)
28+
LL| 1|}
29+
LL| |
30+
LL| |macro_rules! macro_that_defines_a_function {
31+
LL| | (fn $名:ident () $体:tt) => {
32+
LL| 1| fn $名 () $体 fn 他 () {}
33+
^0
34+
LL| | }
35+
LL| |}
36+
LL| |
37+
LL| |macro_that_defines_a_function! {
38+
LL| | fn サビ() {}
39+
LL| |}
40+

‎tests/coverage/unicode.rs

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// edition: 2021
2+
// ignore-windows - we can't force `llvm-cov` to use ANSI escapes on Windows
3+
// llvm-cov-flags: --use-color
4+
5+
// Check that column numbers are denoted in bytes, so that they don't cause
6+
// `llvm-cov` to fail or emit malformed output.
7+
//
8+
// Note that when `llvm-cov` prints ^ arrows on a subsequent line, it simply
9+
// inserts one space character for each "column", with no understanding of
10+
// Unicode or character widths. So those arrows will tend to be misaligned
11+
// for non-ASCII source code, regardless of whether column numbers are code
12+
// points or bytes.
13+
14+
fn main() {
15+
for _İ in 'А'..='Я' { /* Я */ }
16+
17+
if 申し訳ございません() && 申し訳ございません() {
18+
println!("true");
19+
}
20+
21+
サビ();
22+
}
23+
24+
fn 申し訳ございません() -> bool {
25+
std::hint::black_box(false)
26+
}
27+
28+
macro_rules! macro_that_defines_a_function {
29+
(fn $名:ident () $体:tt) => {
30+
fn $名 () $体 fn 他 () {}
31+
}
32+
}
33+
34+
macro_that_defines_a_function! {
35+
fn サビ() {}
36+
}

0 commit comments

Comments
 (0)
Failed to load comments.