From 86ee4ee520cdd74b0ba4ac44eca04d112ccd3f42 Mon Sep 17 00:00:00 2001
From: Noah Hellman <noah@hllmn.net>
Date: Sun, 19 Feb 2023 18:28:00 +0100
Subject: [PATCH] lex: rm lex::Kind::Whitespace

Whitespace tokens do not necessarily create new events, but they act as
delimiters for words with attributes and affect some container
delimiters. Now that we can read the source from inline, we can instead
inspect it for whitespace when needed.

Removing the whitespace token allows the lexer to continue a lot longer
without stopping. E.g. a typical line in a paragraph with no special
characters can turn into a single token.
---
 src/inline.rs | 59 ++++++++++++++++++++++++++++++---------------------
 src/lex.rs    | 20 ++++-------------
 src/lib.rs    |  4 +---
 3 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/src/inline.rs b/src/inline.rs
index 947288c..73e2551 100644
--- a/src/inline.rs
+++ b/src/inline.rs
@@ -62,7 +62,6 @@ pub enum EventKind {
     Exit(Container),
     Atom(Atom),
     Str,
-    Whitespace,
     Attributes { container: bool },
     Placeholder,
 }
@@ -240,11 +239,7 @@ impl<'s> Parser<'s> {
                 .or_else(|| self.parse_container(&first))
                 .or_else(|| self.parse_atom(&first))
                 .unwrap_or_else(|| {
-                    self.push(if matches!(first.kind, lex::Kind::Whitespace) {
-                        EventKind::Whitespace
-                    } else {
-                        EventKind::Str
-                    });
+                    self.push(EventKind::Str);
                 })
         })
     }
@@ -296,7 +291,13 @@ impl<'s> Parser<'s> {
                 self.verbatim = None;
             } else {
                 // continue verbatim
-                if matches!(first.kind, lex::Kind::Whitespace) {
+                let is_whitespace = self
+                    .input
+                    .span
+                    .of(self.input.src)
+                    .chars()
+                    .all(char::is_whitespace);
+                if is_whitespace {
                     if !*non_whitespace_encountered
                         && self.input.peek().map_or(false, |t| {
                             matches!(
@@ -489,10 +490,14 @@ impl<'s> Parser<'s> {
                     // empty container
                     return None;
                 }
-                let whitespace_after = self.events.back().map_or(false, |ev| {
-                    matches!(ev.kind, EventKind::Whitespace | EventKind::Atom(Softbreak))
+                let whitespace_before = self.events.back().map_or(false, |ev| {
+                    ev.span
+                        .of(self.input.src)
+                        .chars()
+                        .last()
+                        .map_or(false, char::is_whitespace)
                 });
-                if opener.bidirectional() && whitespace_after {
+                if opener.bidirectional() && whitespace_before {
                     return None;
                 }
 
@@ -577,19 +582,29 @@ impl<'s> Parser<'s> {
             })
             .or_else(|| {
                 let opener = Opener::from_token(first.kind)?;
-                if opener.bidirectional()
-                    && self
-                        .input
-                        .peek()
-                        .map_or(true, |t| matches!(t.kind, lex::Kind::Whitespace))
-                {
+                let whitespace_after = self
+                    .input
+                    .lexer
+                    .ahead()
+                    .chars()
+                    .next()
+                    .map_or(true, char::is_whitespace);
+                if opener.bidirectional() && whitespace_after {
                     return None;
                 }
+                let whitespace_before = self.events.back().map_or(false, |ev| {
+                    ev.span
+                        .of(self.input.src)
+                        .chars()
+                        .last()
+                        .map_or(false, char::is_whitespace)
+                });
                 if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
                     && self
                         .events
                         .back()
                         .map_or(false, |ev| matches!(ev.kind, EventKind::Str))
+                    && !whitespace_before
                 {
                     return None;
                 }
@@ -675,10 +690,8 @@ impl<'s> Parser<'s> {
     fn merge_str_events(&mut self, span_str: Span) -> Event {
         let mut span = span_str;
         let should_merge = |e: &Event, span: Span| {
-            matches!(
-                e.kind,
-                EventKind::Str | EventKind::Whitespace | EventKind::Placeholder
-            ) && span.end() == e.span.start()
+            matches!(e.kind, EventKind::Str | EventKind::Placeholder)
+                && span.end() == e.span.start()
         };
         while self.events.front().map_or(false, |e| should_merge(e, span)) {
             let ev = self.events.pop_front().unwrap();
@@ -882,9 +895,7 @@ impl<'s> Iterator for Parser<'s> {
             || self // for merge or attributes
                 .events
                 .back()
-                .map_or(false, |ev| {
-                    matches!(ev.kind, EventKind::Str | EventKind::Whitespace)
-                })
+                .map_or(false, |ev| matches!(ev.kind, EventKind::Str))
         {
             if self.parse_event().is_none() {
                 if self.input.complete {
@@ -911,7 +922,7 @@ impl<'s> Iterator for Parser<'s> {
 
         self.events.pop_front().and_then(|e| match e.kind {
             EventKind::Str if e.span.is_empty() => self.next(),
-            EventKind::Str | EventKind::Whitespace => Some(self.merge_str_events(e.span)),
+            EventKind::Str => Some(self.merge_str_events(e.span)),
             EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(),
             _ => Some(e),
         })
diff --git a/src/lex.rs b/src/lex.rs
index 1dd4bea..6efee3d 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -13,7 +13,6 @@ pub(crate) struct Token {
 pub enum Kind {
     Text,
     Newline,
-    Whitespace,
     Nbsp,
     Hardbreak,
     Escape,
@@ -167,6 +166,8 @@ impl<'s> Lexer<'s> {
             _ if escape && first == ' ' => Nbsp,
             _ if escape => Text,
 
+            '\n' => Newline,
+
             '\\' => {
                 if self
                     .peek_char()
@@ -179,12 +180,6 @@ impl<'s> Lexer<'s> {
                 }
             }
 
-            '\n' => Newline,
-            _ if first.is_whitespace() => {
-                self.eat_while(char::is_whitespace);
-                Whitespace
-            }
-
             '[' => Open(Bracket),
             ']' => Close(Bracket),
             '(' => Open(Paren),
@@ -323,18 +318,11 @@ mod test {
         test_lex!("abc", Text.l(3));
         test_lex!(
             "para w/ some _emphasis_ and *strong*.",
-            Text.l(4),
-            Whitespace.l(1),
-            Text.l(2),
-            Whitespace.l(1),
-            Text.l(4),
-            Whitespace.l(1),
+            Text.l(13),
             Sym(Underscore).l(1),
             Text.l(8),
             Sym(Underscore).l(1),
-            Whitespace.l(1),
-            Text.l(3),
-            Whitespace.l(1),
+            Text.l(5),
             Sym(Asterisk).l(1),
             Text.l(6),
             Sym(Asterisk).l(1),
diff --git a/src/lib.rs b/src/lib.rs
index e02f78f..7a8768b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -910,9 +910,7 @@ impl<'s> Parser<'s> {
                     inline::Atom::Escape => Event::Escape,
                 },
                 inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()),
-                inline::EventKind::Whitespace
-                | inline::EventKind::Attributes { .. }
-                | inline::EventKind::Placeholder => {
+                inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => {
                     panic!("{:?}", inline)
                 }
             }