Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lexer tweaks #133070

Merged
merged 6 commits into from
Nov 27, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
@@ -566,19 +566,19 @@ impl Cursor<'_> {

fn c_or_byte_string(
&mut self,
mk_kind: impl FnOnce(bool) -> LiteralKind,
mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
mk_kind: fn(bool) -> LiteralKind,
mk_kind_raw: fn(Option<u8>) -> LiteralKind,
single_quoted: Option<fn(bool) -> LiteralKind>,
) -> TokenKind {
match (self.first(), self.second(), single_quoted) {
('\'', _, Some(mk_kind)) => {
('\'', _, Some(single_quoted)) => {
self.bump();
let terminated = self.single_quoted_string();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
let kind = mk_kind(terminated);
let kind = single_quoted(terminated);
Literal { kind, suffix_start }
}
('"', _, _) => {
72 changes: 31 additions & 41 deletions compiler/rustc_lexer/src/tests.rs
Original file line number Diff line number Diff line change
@@ -77,61 +77,51 @@ fn test_too_many_hashes() {
check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
}

// https://github.com/rust-lang/rust/issues/70528
#[test]
fn test_valid_shebang() {
// https://github.com/rust-lang/rust/issues/70528
let input = "#!/usr/bin/rustrun\nlet x = 5;";
assert_eq!(strip_shebang(input), Some(18));
}
let input = "#!/bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));

#[test]
fn test_invalid_shebang_valid_rust_syntax() {
// https://github.com/rust-lang/rust/issues/70528
let input = "#! [bad_attribute]";
let input = "#![attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_shebang_second_line() {
// Because shebangs are interpreted by the kernel, they must be on the first line
let input = "\n#!/bin/bash";
let input = "#! /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));

let input = "#! [attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_shebang_space() {
let input = "#! /bin/bash";
let input = "#! /* blah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));
}

#[test]
fn test_shebang_empty_shebang() {
let input = "#! \n[attribute(foo)]";
let input = "#! /* blah */ [attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_invalid_shebang_comment() {
let input = "#!//bin/ami/a/comment\n[";
assert_eq!(strip_shebang(input), None)
}
let input = "#! // blah\n/bin/bash";
assert_eq!(strip_shebang(input), Some(10)); // strip up to the newline

#[test]
fn test_invalid_shebang_another_comment() {
let input = "#!/*bin/ami/a/comment*/\n[attribute";
assert_eq!(strip_shebang(input), None)
}
let input = "#! // blah\n[attribute]";
assert_eq!(strip_shebang(input), None);

#[test]
fn test_shebang_valid_rust_after() {
let input = "#!/*bin/ami/a/comment*/\npub fn main() {}";
assert_eq!(strip_shebang(input), Some(23))
}
let input = "#! /* blah\nblah\nblah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(10));

#[test]
fn test_shebang_followed_by_attrib() {
let input = "#!/bin/rust-scripts\n#![allow_unused(true)]";
assert_eq!(strip_shebang(input), Some(19));
let input = "#! /* blah\nblah\nblah */ [attribute]";
assert_eq!(strip_shebang(input), None);

let input = "#!\n/bin/sh";
assert_eq!(strip_shebang(input), Some(2));

let input = "#!\n[attribute]";
assert_eq!(strip_shebang(input), None);

// Because shebangs are interpreted by the kernel, they must be on the first line
let input = "\n#!/bin/bash";
assert_eq!(strip_shebang(input), None);

let input = "\n#![attribute]";
assert_eq!(strip_shebang(input), None);
}

fn check_lexing(src: &str, expect: Expect) {
75 changes: 44 additions & 31 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ use rustc_span::symbol::Symbol;
use rustc_span::{BytePos, Pos, Span};
use tracing::debug;

use crate::lexer::diagnostics::TokenTreeDiagInfo;
use crate::lexer::unicode_chars::UNICODE_ARRAY;
use crate::{errors, make_unclosed_delims_error};

@@ -56,7 +57,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
}

let cursor = Cursor::new(src);
let string_reader = StringReader {
let mut lexer = Lexer {
psess,
start_pos,
pos: start_pos,
@@ -65,34 +66,31 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
override_span,
nbsp_is_whitespace: false,
last_lifetime: None,
token: Token::dummy(),
diag_info: TokenTreeDiagInfo::default(),
};
let (stream, res, unmatched_delims) =
tokentrees::TokenTreesReader::lex_all_token_trees(string_reader);
match res {
Ok(()) if unmatched_delims.is_empty() => Ok(stream),
_ => {
// Return error if there are unmatched delimiters or unclosed delimiters.
// We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
// because the delimiter mismatch is more likely to be the root cause of error

let mut buffer = Vec::with_capacity(1);
for unmatched in unmatched_delims {
if let Some(err) = make_unclosed_delims_error(unmatched, psess) {
buffer.push(err);
}
}
if let Err(errs) = res {
// Add unclosing delimiter or diff marker errors
for err in errs {
buffer.push(err);
}
}
Err(buffer)
let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false);
let unmatched_delims = lexer.diag_info.unmatched_delims;

if res.is_ok() && unmatched_delims.is_empty() {
Ok(stream)
} else {
// Return error if there are unmatched delimiters or unclosed delimiters.
// We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
// because the delimiter mismatch is more likely to be the root cause of error
let mut buffer: Vec<_> = unmatched_delims
.into_iter()
.filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
.collect();
if let Err(errs) = res {
// Add unclosing delimiter or diff marker errors
buffer.extend(errs);
}
Err(buffer)
}
}

struct StringReader<'psess, 'src> {
struct Lexer<'psess, 'src> {
psess: &'psess ParseSess,
/// Initial position, read-only.
start_pos: BytePos,
@@ -111,9 +109,14 @@ struct StringReader<'psess, 'src> {
/// Track the `Span` for the leading `'` of the last lifetime. Used for
/// diagnostics to detect possible typo where `"` was meant.
last_lifetime: Option<Span>,

/// The current token.
token: Token,

diag_info: TokenTreeDiagInfo,
}

impl<'psess, 'src> StringReader<'psess, 'src> {
impl<'psess, 'src> Lexer<'psess, 'src> {
fn dcx(&self) -> DiagCtxtHandle<'psess> {
self.psess.dcx()
}
@@ -124,7 +127,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {

/// Returns the next token, paired with a bool indicating if the token was
/// preceded by whitespace.
fn next_token(&mut self) -> (Token, bool) {
fn next_token_from_cursor(&mut self) -> (Token, bool) {
let mut preceded_by_whitespace = false;
let mut swallow_next_invalid = 0;
// Skip trivial (whitespace & comments) tokens
@@ -231,7 +234,8 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
.push(span);
token::Ident(sym, IdentIsRaw::No)
}
// split up (raw) c string literals to an ident and a string literal when edition < 2021.
// split up (raw) c string literals to an ident and a string literal when edition <
// 2021.
rustc_lexer::TokenKind::Literal {
kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
suffix_start: _,
@@ -252,7 +256,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
rustc_lexer::TokenKind::GuardedStrPrefix => {
self.maybe_report_guarded_str(start, str_before)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -296,13 +302,20 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
if prefix_span.at_least_rust_2021() {
let span = self.mk_sp(start, self.pos);

let lifetime_name_without_tick = Symbol::intern(&self.str_from(ident_start));
let lifetime_name_without_tick =
Symbol::intern(&self.str_from(ident_start));
if !lifetime_name_without_tick.can_be_raw() {
self.dcx().emit_err(errors::CannotBeRawLifetime { span, ident: lifetime_name_without_tick });
self.dcx().emit_err(
errors::CannotBeRawLifetime {
span,
ident: lifetime_name_without_tick
}
);
}

// Put the `'` back onto the lifetime name.
let mut lifetime_name = String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
let mut lifetime_name =
String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
lifetime_name.push('\'');
lifetime_name += lifetime_name_without_tick.as_str();
let sym = Symbol::intern(&lifetime_name);
83 changes: 38 additions & 45 deletions compiler/rustc_parse/src/lexer/tokentrees.rs
Original file line number Diff line number Diff line change
@@ -4,41 +4,19 @@ use rustc_ast_pretty::pprust::token_to_string;
use rustc_errors::{Applicability, PErr};
use rustc_span::symbol::kw;

use super::diagnostics::{
TokenTreeDiagInfo, report_suspicious_mismatch_block, same_indentation_level,
};
use super::{StringReader, UnmatchedDelim};
use super::diagnostics::{report_suspicious_mismatch_block, same_indentation_level};
use super::{Lexer, UnmatchedDelim};
use crate::Parser;

pub(super) struct TokenTreesReader<'psess, 'src> {
string_reader: StringReader<'psess, 'src>,
/// The "next" token, which has been obtained from the `StringReader` but
/// not yet handled by the `TokenTreesReader`.
token: Token,
diag_info: TokenTreeDiagInfo,
}

impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
pub(super) fn lex_all_token_trees(
string_reader: StringReader<'psess, 'src>,
) -> (TokenStream, Result<(), Vec<PErr<'psess>>>, Vec<UnmatchedDelim>) {
let mut tt_reader = TokenTreesReader {
string_reader,
token: Token::dummy(),
diag_info: TokenTreeDiagInfo::default(),
};
let (_open_spacing, stream, res) = tt_reader.lex_token_trees(/* is_delimited */ false);
(stream, res, tt_reader.diag_info.unmatched_delims)
}

impl<'psess, 'src> Lexer<'psess, 'src> {
// Lex into a token stream. The `Spacing` in the result is that of the
// opening delimiter.
fn lex_token_trees(
pub(super) fn lex_token_trees(
&mut self,
is_delimited: bool,
) -> (Spacing, TokenStream, Result<(), Vec<PErr<'psess>>>) {
// Move past the opening delimiter.
let (_, open_spacing) = self.bump(false);
let open_spacing = self.bump_minimal();

let mut buf = Vec::new();
loop {
@@ -71,7 +49,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
}
_ => {
// Get the next normal token.
let (this_tok, this_spacing) = self.bump(true);
let (this_tok, this_spacing) = self.bump();
buf.push(TokenTree::Token(this_tok, this_spacing));
}
}
@@ -80,7 +58,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {

fn eof_err(&mut self) -> PErr<'psess> {
let msg = "this file contains an unclosed delimiter";
let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg);
let mut err = self.dcx().struct_span_err(self.token.span, msg);

let unclosed_delimiter_show_limit = 5;
let len = usize::min(unclosed_delimiter_show_limit, self.diag_info.open_braces.len());
@@ -110,7 +88,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
report_suspicious_mismatch_block(
&mut err,
&self.diag_info,
self.string_reader.psess.source_map(),
self.psess.source_map(),
*delim,
)
}
@@ -136,7 +114,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {

// Expand to cover the entire delimited token tree.
let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
let sm = self.string_reader.psess.source_map();
let sm = self.psess.source_map();

let close_spacing = match self.token.kind {
// Correct delimiter.
@@ -160,7 +138,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
}

// Move past the closing delimiter.
self.bump(false).1
self.bump_minimal()
}
// Incorrect delimiter.
token::CloseDelim(close_delim) => {
@@ -203,7 +181,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
// bar(baz(
// } // Incorrect delimiter but matches the earlier `{`
if !self.diag_info.open_braces.iter().any(|&(b, _)| b == close_delim) {
self.bump(false).1
self.bump_minimal()
} else {
// The choice of value here doesn't matter.
Spacing::Alone
@@ -225,14 +203,14 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
}

// Move on to the next token, returning the current token and its spacing.
// Will glue adjacent single-char tokens together if `glue` is set.
fn bump(&mut self, glue: bool) -> (Token, Spacing) {
// Will glue adjacent single-char tokens together.
fn bump(&mut self) -> (Token, Spacing) {
let (this_spacing, next_tok) = loop {
let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token();
let (next_tok, is_next_tok_preceded_by_whitespace) = self.next_token_from_cursor();

if is_next_tok_preceded_by_whitespace {
break (Spacing::Alone, next_tok);
} else if glue && let Some(glued) = self.token.glue(&next_tok) {
} else if let Some(glued) = self.token.glue(&next_tok) {
self.token = glued;
} else {
let this_spacing = if next_tok.is_punct() {
@@ -249,14 +227,34 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
(this_tok, this_spacing)
}

// Cut-down version of `bump` used when the token kind is known in advance:
// it never needs to glue the new token onto `self.token`, so it skips the
// glue loop entirely and only computes the spacing of the consumed token.
//
// Advances the lexer by one token (storing it in `self.token`) and returns
// the `Spacing` that applied to that token.
fn bump_minimal(&mut self) -> Spacing {
    let (next_tok, is_next_tok_preceded_by_whitespace) = self.next_token_from_cursor();

    // Same spacing rules as the tail of `bump`, flattened into a single
    // `else if` chain (avoids the collapsible `else { if … }` nesting):
    // whitespace before the token or EOF => `Alone`; punctuation => `Joint`
    // (may glue with what follows); anything else => `JointHidden`.
    let this_spacing = if is_next_tok_preceded_by_whitespace {
        Spacing::Alone
    } else if next_tok.is_punct() {
        Spacing::Joint
    } else if next_tok == token::Eof {
        Spacing::Alone
    } else {
        Spacing::JointHidden
    };

    self.token = next_tok;
    this_spacing
}

fn unclosed_delim_err(
&mut self,
tts: TokenStream,
mut errs: Vec<PErr<'psess>>,
) -> Vec<PErr<'psess>> {
// If there are unclosed delims, see if there are diff markers and if so, point them
// out instead of complaining about the unclosed delims.
let mut parser = Parser::new(self.string_reader.psess, tts, None);
let mut parser = Parser::new(self.psess, tts, None);
let mut diff_errs = vec![];
// Suggest removing a `{` we think appears in an `if`/`while` condition.
// We want to suggest removing a `{` only if we think we're in an `if`/`while` condition,
@@ -314,14 +312,9 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> {
// An unexpected closing delimiter (i.e., there is no matching opening delimiter).
let token_str = token_to_string(&self.token);
let msg = format!("unexpected closing delimiter: `{token_str}`");
let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg);
let mut err = self.dcx().struct_span_err(self.token.span, msg);

report_suspicious_mismatch_block(
&mut err,
&self.diag_info,
self.string_reader.psess.source_map(),
delim,
);
report_suspicious_mismatch_block(&mut err, &self.diag_info, self.psess.source_map(), delim);
err.span_label(self.token.span, "unexpected closing delimiter");
err
}
8 changes: 4 additions & 4 deletions compiler/rustc_parse/src/lexer/unicode_chars.rs
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
use rustc_span::symbol::kw;
use rustc_span::{BytePos, Pos, Span};

use super::StringReader;
use super::Lexer;
use crate::errors::TokenSubstitution;
use crate::token::{self, Delimiter};

@@ -338,7 +338,7 @@ const ASCII_ARRAY: &[(&str, &str, Option<token::TokenKind>)] = &[
];

pub(super) fn check_for_substitution(
reader: &StringReader<'_, '_>,
lexer: &Lexer<'_, '_>,
pos: BytePos,
ch: char,
count: usize,
@@ -351,11 +351,11 @@ pub(super) fn check_for_substitution(

let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else {
let msg = format!("substitution character not found for '{ch}'");
reader.dcx().span_bug(span, msg);
lexer.dcx().span_bug(span, msg);
};

// special help suggestion for "directed" double quotes
let sugg = if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') {
let sugg = if let Some(s) = peek_delimited(&lexer.src[lexer.src_index(pos)..], '“', '”') {
let span = Span::with_root_ctxt(
pos,
pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()),
Loading