lex: rm lex::Kind::Whitespace

Whitespace tokens do not necessarily create new events, but they work as
a delimiter for words with attributes and affect some container
delimiters. Now that we can read the source from the inline parser, we
can instead inspect for whitespace when needed.

Removing the whitespace token allows the lexer to continue a lot longer
without stopping. For example, a typical line in a paragraph with no
special characters can now become a single token.
This commit is contained in:
Noah Hellman 2023-02-19 18:28:00 +01:00
parent 9454a2e393
commit 86ee4ee520
3 changed files with 40 additions and 43 deletions

View file

@ -62,7 +62,6 @@ pub enum EventKind {
Exit(Container), Exit(Container),
Atom(Atom), Atom(Atom),
Str, Str,
Whitespace,
Attributes { container: bool }, Attributes { container: bool },
Placeholder, Placeholder,
} }
@ -240,11 +239,7 @@ impl<'s> Parser<'s> {
.or_else(|| self.parse_container(&first)) .or_else(|| self.parse_container(&first))
.or_else(|| self.parse_atom(&first)) .or_else(|| self.parse_atom(&first))
.unwrap_or_else(|| { .unwrap_or_else(|| {
self.push(if matches!(first.kind, lex::Kind::Whitespace) { self.push(EventKind::Str);
EventKind::Whitespace
} else {
EventKind::Str
});
}) })
}) })
} }
@ -296,7 +291,13 @@ impl<'s> Parser<'s> {
self.verbatim = None; self.verbatim = None;
} else { } else {
// continue verbatim // continue verbatim
if matches!(first.kind, lex::Kind::Whitespace) { let is_whitespace = self
.input
.span
.of(self.input.src)
.chars()
.all(char::is_whitespace);
if is_whitespace {
if !*non_whitespace_encountered if !*non_whitespace_encountered
&& self.input.peek().map_or(false, |t| { && self.input.peek().map_or(false, |t| {
matches!( matches!(
@ -489,10 +490,14 @@ impl<'s> Parser<'s> {
// empty container // empty container
return None; return None;
} }
let whitespace_after = self.events.back().map_or(false, |ev| { let whitespace_before = self.events.back().map_or(false, |ev| {
matches!(ev.kind, EventKind::Whitespace | EventKind::Atom(Softbreak)) ev.span
.of(self.input.src)
.chars()
.last()
.map_or(false, char::is_whitespace)
}); });
if opener.bidirectional() && whitespace_after { if opener.bidirectional() && whitespace_before {
return None; return None;
} }
@ -577,19 +582,29 @@ impl<'s> Parser<'s> {
}) })
.or_else(|| { .or_else(|| {
let opener = Opener::from_token(first.kind)?; let opener = Opener::from_token(first.kind)?;
if opener.bidirectional() let whitespace_after = self
&& self
.input .input
.peek() .lexer
.map_or(true, |t| matches!(t.kind, lex::Kind::Whitespace)) .ahead()
{ .chars()
.next()
.map_or(true, char::is_whitespace);
if opener.bidirectional() && whitespace_after {
return None; return None;
} }
let whitespace_before = self.events.back().map_or(false, |ev| {
ev.span
.of(self.input.src)
.chars()
.last()
.map_or(false, char::is_whitespace)
});
if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted) if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
&& self && self
.events .events
.back() .back()
.map_or(false, |ev| matches!(ev.kind, EventKind::Str)) .map_or(false, |ev| matches!(ev.kind, EventKind::Str))
&& !whitespace_before
{ {
return None; return None;
} }
@ -675,10 +690,8 @@ impl<'s> Parser<'s> {
fn merge_str_events(&mut self, span_str: Span) -> Event { fn merge_str_events(&mut self, span_str: Span) -> Event {
let mut span = span_str; let mut span = span_str;
let should_merge = |e: &Event, span: Span| { let should_merge = |e: &Event, span: Span| {
matches!( matches!(e.kind, EventKind::Str | EventKind::Placeholder)
e.kind, && span.end() == e.span.start()
EventKind::Str | EventKind::Whitespace | EventKind::Placeholder
) && span.end() == e.span.start()
}; };
while self.events.front().map_or(false, |e| should_merge(e, span)) { while self.events.front().map_or(false, |e| should_merge(e, span)) {
let ev = self.events.pop_front().unwrap(); let ev = self.events.pop_front().unwrap();
@ -882,9 +895,7 @@ impl<'s> Iterator for Parser<'s> {
|| self // for merge or attributes || self // for merge or attributes
.events .events
.back() .back()
.map_or(false, |ev| { .map_or(false, |ev| matches!(ev.kind, EventKind::Str))
matches!(ev.kind, EventKind::Str | EventKind::Whitespace)
})
{ {
if self.parse_event().is_none() { if self.parse_event().is_none() {
if self.input.complete { if self.input.complete {
@ -911,7 +922,7 @@ impl<'s> Iterator for Parser<'s> {
self.events.pop_front().and_then(|e| match e.kind { self.events.pop_front().and_then(|e| match e.kind {
EventKind::Str if e.span.is_empty() => self.next(), EventKind::Str if e.span.is_empty() => self.next(),
EventKind::Str | EventKind::Whitespace => Some(self.merge_str_events(e.span)), EventKind::Str => Some(self.merge_str_events(e.span)),
EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(), EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(),
_ => Some(e), _ => Some(e),
}) })

View file

@ -13,7 +13,6 @@ pub(crate) struct Token {
pub enum Kind { pub enum Kind {
Text, Text,
Newline, Newline,
Whitespace,
Nbsp, Nbsp,
Hardbreak, Hardbreak,
Escape, Escape,
@ -167,6 +166,8 @@ impl<'s> Lexer<'s> {
_ if escape && first == ' ' => Nbsp, _ if escape && first == ' ' => Nbsp,
_ if escape => Text, _ if escape => Text,
'\n' => Newline,
'\\' => { '\\' => {
if self if self
.peek_char() .peek_char()
@ -179,12 +180,6 @@ impl<'s> Lexer<'s> {
} }
} }
'\n' => Newline,
_ if first.is_whitespace() => {
self.eat_while(char::is_whitespace);
Whitespace
}
'[' => Open(Bracket), '[' => Open(Bracket),
']' => Close(Bracket), ']' => Close(Bracket),
'(' => Open(Paren), '(' => Open(Paren),
@ -323,18 +318,11 @@ mod test {
test_lex!("abc", Text.l(3)); test_lex!("abc", Text.l(3));
test_lex!( test_lex!(
"para w/ some _emphasis_ and *strong*.", "para w/ some _emphasis_ and *strong*.",
Text.l(4), Text.l(13),
Whitespace.l(1),
Text.l(2),
Whitespace.l(1),
Text.l(4),
Whitespace.l(1),
Sym(Underscore).l(1), Sym(Underscore).l(1),
Text.l(8), Text.l(8),
Sym(Underscore).l(1), Sym(Underscore).l(1),
Whitespace.l(1), Text.l(5),
Text.l(3),
Whitespace.l(1),
Sym(Asterisk).l(1), Sym(Asterisk).l(1),
Text.l(6), Text.l(6),
Sym(Asterisk).l(1), Sym(Asterisk).l(1),

View file

@ -910,9 +910,7 @@ impl<'s> Parser<'s> {
inline::Atom::Escape => Event::Escape, inline::Atom::Escape => Event::Escape,
}, },
inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()), inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()),
inline::EventKind::Whitespace inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => {
| inline::EventKind::Attributes { .. }
| inline::EventKind::Placeholder => {
panic!("{:?}", inline) panic!("{:?}", inline)
} }
} }