From 86ee4ee520cdd74b0ba4ac44eca04d112ccd3f42 Mon Sep 17 00:00:00 2001 From: Noah Hellman Date: Sun, 19 Feb 2023 18:28:00 +0100 Subject: [PATCH] lex: rm lex::Kind::Whitespace Whitespace tokens do not necessarily create new events, but they act as delimiters for words with attributes and affect some container delimiters. Now that we can read the source from inline, we can instead inspect for whitespace when needed. Removing the whitespace token allows the lexer to continue a lot longer without stopping. E.g. a typical line in a paragraph with no special characters can turn into a single token. --- src/inline.rs | 59 ++++++++++++++++++++++++++++++--------------------- src/lex.rs | 20 ++++------------- src/lib.rs | 4 +--- 3 files changed, 40 insertions(+), 43 deletions(-) diff --git a/src/inline.rs b/src/inline.rs index 947288c..73e2551 100644 --- a/src/inline.rs +++ b/src/inline.rs @@ -62,7 +62,6 @@ pub enum EventKind { Exit(Container), Atom(Atom), Str, - Whitespace, Attributes { container: bool }, Placeholder, } @@ -240,11 +239,7 @@ impl<'s> Parser<'s> { .or_else(|| self.parse_container(&first)) .or_else(|| self.parse_atom(&first)) .unwrap_or_else(|| { - self.push(if matches!(first.kind, lex::Kind::Whitespace) { - EventKind::Whitespace - } else { - EventKind::Str - }); + self.push(EventKind::Str); }) }) } @@ -296,7 +291,13 @@ impl<'s> Parser<'s> { self.verbatim = None; } else { // continue verbatim - if matches!(first.kind, lex::Kind::Whitespace) { + let is_whitespace = self + .input + .span + .of(self.input.src) + .chars() + .all(char::is_whitespace); + if is_whitespace { if !*non_whitespace_encountered && self.input.peek().map_or(false, |t| { matches!( @@ -489,10 +490,14 @@ impl<'s> Parser<'s> { // empty container return None; } - let whitespace_after = self.events.back().map_or(false, |ev| { - matches!(ev.kind, EventKind::Whitespace | EventKind::Atom(Softbreak)) + let whitespace_before = self.events.back().map_or(false, |ev| { + ev.span + .of(self.input.src) + 
.chars() + .last() + .map_or(false, char::is_whitespace) }); - if opener.bidirectional() && whitespace_after { + if opener.bidirectional() && whitespace_before { return None; } @@ -577,19 +582,29 @@ impl<'s> Parser<'s> { }) .or_else(|| { let opener = Opener::from_token(first.kind)?; - if opener.bidirectional() - && self - .input - .peek() - .map_or(true, |t| matches!(t.kind, lex::Kind::Whitespace)) - { + let whitespace_after = self + .input + .lexer + .ahead() + .chars() + .next() + .map_or(true, char::is_whitespace); + if opener.bidirectional() && whitespace_after { return None; } + let whitespace_before = self.events.back().map_or(false, |ev| { + ev.span + .of(self.input.src) + .chars() + .last() + .map_or(false, char::is_whitespace) + }); if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted) && self .events .back() .map_or(false, |ev| matches!(ev.kind, EventKind::Str)) + && !whitespace_before { return None; } @@ -675,10 +690,8 @@ impl<'s> Parser<'s> { fn merge_str_events(&mut self, span_str: Span) -> Event { let mut span = span_str; let should_merge = |e: &Event, span: Span| { - matches!( - e.kind, - EventKind::Str | EventKind::Whitespace | EventKind::Placeholder - ) && span.end() == e.span.start() + matches!(e.kind, EventKind::Str | EventKind::Placeholder) + && span.end() == e.span.start() }; while self.events.front().map_or(false, |e| should_merge(e, span)) { let ev = self.events.pop_front().unwrap(); @@ -882,9 +895,7 @@ impl<'s> Iterator for Parser<'s> { || self // for merge or attributes .events .back() - .map_or(false, |ev| { - matches!(ev.kind, EventKind::Str | EventKind::Whitespace) - }) + .map_or(false, |ev| matches!(ev.kind, EventKind::Str)) { if self.parse_event().is_none() { if self.input.complete { @@ -911,7 +922,7 @@ impl<'s> Iterator for Parser<'s> { self.events.pop_front().and_then(|e| match e.kind { EventKind::Str if e.span.is_empty() => self.next(), - EventKind::Str | EventKind::Whitespace => Some(self.merge_str_events(e.span)), + 
EventKind::Str => Some(self.merge_str_events(e.span)), EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(), _ => Some(e), }) diff --git a/src/lex.rs b/src/lex.rs index 1dd4bea..6efee3d 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -13,7 +13,6 @@ pub(crate) struct Token { pub enum Kind { Text, Newline, - Whitespace, Nbsp, Hardbreak, Escape, @@ -167,6 +166,8 @@ impl<'s> Lexer<'s> { _ if escape && first == ' ' => Nbsp, _ if escape => Text, + '\n' => Newline, + '\\' => { if self .peek_char() @@ -179,12 +180,6 @@ impl<'s> Lexer<'s> { } } - '\n' => Newline, - _ if first.is_whitespace() => { - self.eat_while(char::is_whitespace); - Whitespace - } - '[' => Open(Bracket), ']' => Close(Bracket), '(' => Open(Paren), @@ -323,18 +318,11 @@ mod test { test_lex!("abc", Text.l(3)); test_lex!( "para w/ some _emphasis_ and *strong*.", - Text.l(4), - Whitespace.l(1), - Text.l(2), - Whitespace.l(1), - Text.l(4), - Whitespace.l(1), + Text.l(13), Sym(Underscore).l(1), Text.l(8), Sym(Underscore).l(1), - Whitespace.l(1), - Text.l(3), - Whitespace.l(1), + Text.l(5), Sym(Asterisk).l(1), Text.l(6), Sym(Asterisk).l(1), diff --git a/src/lib.rs b/src/lib.rs index e02f78f..7a8768b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -910,9 +910,7 @@ impl<'s> Parser<'s> { inline::Atom::Escape => Event::Escape, }, inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()), - inline::EventKind::Whitespace - | inline::EventKind::Attributes { .. } - | inline::EventKind::Placeholder => { + inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => { panic!("{:?}", inline) } }