lex: rm lex::Kind::Whitespace

Whitespace tokens do not necessarily create new events, but they act as delimiters for words with attributes and affect some container delimiters. Now that we can read the source from the inline parser, we can instead inspect it for whitespace when needed. Removing the whitespace token allows the lexer to continue a lot longer without stopping; e.g., a typical line in a paragraph with no special characters can turn into a single token.
2023-02-19 18:28:00 +01:00 · 2023-02-19 18:28:00 +01:00 · 86ee4ee520
commit 86ee4ee520
parent 9454a2e393
3 changed files with 40 additions and 43 deletions
--- a/src/inline.rs
+++ b/src/inline.rs
@ -62,7 +62,6 @@ pub enum EventKind {
    Exit(Container),
    Atom(Atom),
    Str,
    Whitespace,
    Attributes { container: bool },
    Placeholder,
 }
@ -240,11 +239,7 @@ impl<'s> Parser<'s> {
                .or_else(|| self.parse_container(&first))
                .or_else(|| self.parse_atom(&first))
                .unwrap_or_else(|| {
-                    self.push(if matches!(first.kind, lex::Kind::Whitespace) {
+                    self.push(EventKind::Str);
                        EventKind::Whitespace
                    } else {
                        EventKind::Str
                    });
                })
        })
    }
@ -296,7 +291,13 @@ impl<'s> Parser<'s> {
                self.verbatim = None;
            } else {
                // continue verbatim
-                if matches!(first.kind, lex::Kind::Whitespace) {
+                let is_whitespace = self
                    .input
                    .span
                    .of(self.input.src)
                    .chars()
                    .all(char::is_whitespace);
                if is_whitespace {
                    if !*non_whitespace_encountered
                        && self.input.peek().map_or(false, |t| {
                            matches!(
@ -489,10 +490,14 @@ impl<'s> Parser<'s> {
                    // empty container
                    return None;
                }
-                let whitespace_after = self.events.back().map_or(false, |ev| {
+                let whitespace_before = self.events.back().map_or(false, |ev| {
-                    matches!(ev.kind, EventKind::Whitespace | EventKind::Atom(Softbreak))
+                    ev.span
                        .of(self.input.src)
                        .chars()
                        .last()
                        .map_or(false, char::is_whitespace)
                });
-                if opener.bidirectional() && whitespace_after {
+                if opener.bidirectional() && whitespace_before {
                    return None;
                }
@ -577,19 +582,29 @@ impl<'s> Parser<'s> {
            })
            .or_else(|| {
                let opener = Opener::from_token(first.kind)?;
-                if opener.bidirectional()
+                let whitespace_after = self
                    && self
                    .input
-                        .peek()
+                    .lexer
-                        .map_or(true, |t| matches!(t.kind, lex::Kind::Whitespace))
+                    .ahead()
-                {
+                    .chars()
                    .next()
                    .map_or(true, char::is_whitespace);
                if opener.bidirectional() && whitespace_after {
                    return None;
                }
                let whitespace_before = self.events.back().map_or(false, |ev| {
                    ev.span
                        .of(self.input.src)
                        .chars()
                        .last()
                        .map_or(false, char::is_whitespace)
                });
                if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
                    && self
                        .events
                        .back()
                        .map_or(false, |ev| matches!(ev.kind, EventKind::Str))
                    && !whitespace_before
                {
                    return None;
                }
@ -675,10 +690,8 @@ impl<'s> Parser<'s> {
    fn merge_str_events(&mut self, span_str: Span) -> Event {
        let mut span = span_str;
        let should_merge = |e: &Event, span: Span| {
-            matches!(
+            matches!(e.kind, EventKind::Str | EventKind::Placeholder)
-                e.kind,
+                && span.end() == e.span.start()
                EventKind::Str | EventKind::Whitespace | EventKind::Placeholder
            ) && span.end() == e.span.start()
        };
        while self.events.front().map_or(false, |e| should_merge(e, span)) {
            let ev = self.events.pop_front().unwrap();
@ -882,9 +895,7 @@ impl<'s> Iterator for Parser<'s> {
            || self // for merge or attributes
                .events
                .back()
-                .map_or(false, |ev| {
+                .map_or(false, |ev| matches!(ev.kind, EventKind::Str))
                    matches!(ev.kind, EventKind::Str | EventKind::Whitespace)
                })
        {
            if self.parse_event().is_none() {
                if self.input.complete {
@ -911,7 +922,7 @@ impl<'s> Iterator for Parser<'s> {
        self.events.pop_front().and_then(|e| match e.kind {
            EventKind::Str if e.span.is_empty() => self.next(),
-            EventKind::Str | EventKind::Whitespace => Some(self.merge_str_events(e.span)),
+            EventKind::Str => Some(self.merge_str_events(e.span)),
            EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(),
            _ => Some(e),
        })
--- a/src/lex.rs
+++ b/src/lex.rs
@ -13,7 +13,6 @@ pub(crate) struct Token {
 pub enum Kind {
    Text,
    Newline,
    Whitespace,
    Nbsp,
    Hardbreak,
    Escape,
@ -167,6 +166,8 @@ impl<'s> Lexer<'s> {
            _ if escape && first == ' ' => Nbsp,
            _ if escape => Text,
            '\n' => Newline,
            '\\' => {
                if self
                    .peek_char()
@ -179,12 +180,6 @@ impl<'s> Lexer<'s> {
                }
            }
            '\n' => Newline,
            _ if first.is_whitespace() => {
                self.eat_while(char::is_whitespace);
                Whitespace
            }
            '[' => Open(Bracket),
            ']' => Close(Bracket),
            '(' => Open(Paren),
@ -323,18 +318,11 @@ mod test {
        test_lex!("abc", Text.l(3));
        test_lex!(
            "para w/ some _emphasis_ and *strong*.",
-            Text.l(4),
+            Text.l(13),
            Whitespace.l(1),
            Text.l(2),
            Whitespace.l(1),
            Text.l(4),
            Whitespace.l(1),
            Sym(Underscore).l(1),
            Text.l(8),
            Sym(Underscore).l(1),
-            Whitespace.l(1),
+            Text.l(5),
            Text.l(3),
            Whitespace.l(1),
            Sym(Asterisk).l(1),
            Text.l(6),
            Sym(Asterisk).l(1),
--- a/src/lib.rs
+++ b/src/lib.rs
@ -910,9 +910,7 @@ impl<'s> Parser<'s> {
                    inline::Atom::Escape => Event::Escape,
                },
                inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()),
-                inline::EventKind::Whitespace
+                inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => {
                | inline::EventKind::Attributes { .. }
                | inline::EventKind::Placeholder => {
                    panic!("{:?}", inline)
                }
            }