lex: rm lex::Kind::Whitespace

Whitespace tokens do not necessarily create new events, but they work as
delimiters for words with attributes and affect some container
delimiters. Now that we can read the source from inline, we can instead
inspect for whitespace when needed.

Removing the whitespace token allows the lexer to continue a lot longer
without stopping. E.g. a typical line in a paragraph with no special
characters can turn into a single token.
This commit is contained in:
Noah Hellman 2023-02-19 18:28:00 +01:00
parent 9454a2e393
commit 86ee4ee520
3 changed files with 40 additions and 43 deletions

View file

@ -62,7 +62,6 @@ pub enum EventKind {
Exit(Container),
Atom(Atom),
Str,
Whitespace,
Attributes { container: bool },
Placeholder,
}
@ -240,11 +239,7 @@ impl<'s> Parser<'s> {
.or_else(|| self.parse_container(&first))
.or_else(|| self.parse_atom(&first))
.unwrap_or_else(|| {
self.push(if matches!(first.kind, lex::Kind::Whitespace) {
EventKind::Whitespace
} else {
EventKind::Str
});
self.push(EventKind::Str);
})
})
}
@ -296,7 +291,13 @@ impl<'s> Parser<'s> {
self.verbatim = None;
} else {
// continue verbatim
if matches!(first.kind, lex::Kind::Whitespace) {
let is_whitespace = self
.input
.span
.of(self.input.src)
.chars()
.all(char::is_whitespace);
if is_whitespace {
if !*non_whitespace_encountered
&& self.input.peek().map_or(false, |t| {
matches!(
@ -489,10 +490,14 @@ impl<'s> Parser<'s> {
// empty container
return None;
}
let whitespace_after = self.events.back().map_or(false, |ev| {
matches!(ev.kind, EventKind::Whitespace | EventKind::Atom(Softbreak))
let whitespace_before = self.events.back().map_or(false, |ev| {
ev.span
.of(self.input.src)
.chars()
.last()
.map_or(false, char::is_whitespace)
});
if opener.bidirectional() && whitespace_after {
if opener.bidirectional() && whitespace_before {
return None;
}
@ -577,19 +582,29 @@ impl<'s> Parser<'s> {
})
.or_else(|| {
let opener = Opener::from_token(first.kind)?;
if opener.bidirectional()
&& self
.input
.peek()
.map_or(true, |t| matches!(t.kind, lex::Kind::Whitespace))
{
let whitespace_after = self
.input
.lexer
.ahead()
.chars()
.next()
.map_or(true, char::is_whitespace);
if opener.bidirectional() && whitespace_after {
return None;
}
let whitespace_before = self.events.back().map_or(false, |ev| {
ev.span
.of(self.input.src)
.chars()
.last()
.map_or(false, char::is_whitespace)
});
if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
&& self
.events
.back()
.map_or(false, |ev| matches!(ev.kind, EventKind::Str))
&& !whitespace_before
{
return None;
}
@ -675,10 +690,8 @@ impl<'s> Parser<'s> {
fn merge_str_events(&mut self, span_str: Span) -> Event {
let mut span = span_str;
let should_merge = |e: &Event, span: Span| {
matches!(
e.kind,
EventKind::Str | EventKind::Whitespace | EventKind::Placeholder
) && span.end() == e.span.start()
matches!(e.kind, EventKind::Str | EventKind::Placeholder)
&& span.end() == e.span.start()
};
while self.events.front().map_or(false, |e| should_merge(e, span)) {
let ev = self.events.pop_front().unwrap();
@ -882,9 +895,7 @@ impl<'s> Iterator for Parser<'s> {
|| self // for merge or attributes
.events
.back()
.map_or(false, |ev| {
matches!(ev.kind, EventKind::Str | EventKind::Whitespace)
})
.map_or(false, |ev| matches!(ev.kind, EventKind::Str))
{
if self.parse_event().is_none() {
if self.input.complete {
@ -911,7 +922,7 @@ impl<'s> Iterator for Parser<'s> {
self.events.pop_front().and_then(|e| match e.kind {
EventKind::Str if e.span.is_empty() => self.next(),
EventKind::Str | EventKind::Whitespace => Some(self.merge_str_events(e.span)),
EventKind::Str => Some(self.merge_str_events(e.span)),
EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(),
_ => Some(e),
})

View file

@ -13,7 +13,6 @@ pub(crate) struct Token {
pub enum Kind {
Text,
Newline,
Whitespace,
Nbsp,
Hardbreak,
Escape,
@ -167,6 +166,8 @@ impl<'s> Lexer<'s> {
_ if escape && first == ' ' => Nbsp,
_ if escape => Text,
'\n' => Newline,
'\\' => {
if self
.peek_char()
@ -179,12 +180,6 @@ impl<'s> Lexer<'s> {
}
}
'\n' => Newline,
_ if first.is_whitespace() => {
self.eat_while(char::is_whitespace);
Whitespace
}
'[' => Open(Bracket),
']' => Close(Bracket),
'(' => Open(Paren),
@ -323,18 +318,11 @@ mod test {
test_lex!("abc", Text.l(3));
test_lex!(
"para w/ some _emphasis_ and *strong*.",
Text.l(4),
Whitespace.l(1),
Text.l(2),
Whitespace.l(1),
Text.l(4),
Whitespace.l(1),
Text.l(13),
Sym(Underscore).l(1),
Text.l(8),
Sym(Underscore).l(1),
Whitespace.l(1),
Text.l(3),
Whitespace.l(1),
Text.l(5),
Sym(Asterisk).l(1),
Text.l(6),
Sym(Asterisk).l(1),

View file

@ -910,9 +910,7 @@ impl<'s> Parser<'s> {
inline::Atom::Escape => Event::Escape,
},
inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()),
inline::EventKind::Whitespace
| inline::EventKind::Attributes { .. }
| inline::EventKind::Placeholder => {
inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => {
panic!("{:?}", inline)
}
}