lex: rm lex::Kind::Whitespace
Whitespace tokens do not necessarily create new events, but they act as delimiters for words with attributes and affect some container delimiters. Now that we can read the source from the inline parser, we can instead inspect the source for whitespace when needed. Removing the whitespace token allows the lexer to continue a lot longer without stopping. For example, a typical line in a paragraph with no special characters can turn into a single token.
This commit is contained in:
parent
9454a2e393
commit
86ee4ee520
3 changed files with 40 additions and 43 deletions
|
@ -62,7 +62,6 @@ pub enum EventKind {
|
||||||
Exit(Container),
|
Exit(Container),
|
||||||
Atom(Atom),
|
Atom(Atom),
|
||||||
Str,
|
Str,
|
||||||
Whitespace,
|
|
||||||
Attributes { container: bool },
|
Attributes { container: bool },
|
||||||
Placeholder,
|
Placeholder,
|
||||||
}
|
}
|
||||||
|
@ -240,11 +239,7 @@ impl<'s> Parser<'s> {
|
||||||
.or_else(|| self.parse_container(&first))
|
.or_else(|| self.parse_container(&first))
|
||||||
.or_else(|| self.parse_atom(&first))
|
.or_else(|| self.parse_atom(&first))
|
||||||
.unwrap_or_else(|| {
|
.unwrap_or_else(|| {
|
||||||
self.push(if matches!(first.kind, lex::Kind::Whitespace) {
|
self.push(EventKind::Str);
|
||||||
EventKind::Whitespace
|
|
||||||
} else {
|
|
||||||
EventKind::Str
|
|
||||||
});
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -296,7 +291,13 @@ impl<'s> Parser<'s> {
|
||||||
self.verbatim = None;
|
self.verbatim = None;
|
||||||
} else {
|
} else {
|
||||||
// continue verbatim
|
// continue verbatim
|
||||||
if matches!(first.kind, lex::Kind::Whitespace) {
|
let is_whitespace = self
|
||||||
|
.input
|
||||||
|
.span
|
||||||
|
.of(self.input.src)
|
||||||
|
.chars()
|
||||||
|
.all(char::is_whitespace);
|
||||||
|
if is_whitespace {
|
||||||
if !*non_whitespace_encountered
|
if !*non_whitespace_encountered
|
||||||
&& self.input.peek().map_or(false, |t| {
|
&& self.input.peek().map_or(false, |t| {
|
||||||
matches!(
|
matches!(
|
||||||
|
@ -489,10 +490,14 @@ impl<'s> Parser<'s> {
|
||||||
// empty container
|
// empty container
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let whitespace_after = self.events.back().map_or(false, |ev| {
|
let whitespace_before = self.events.back().map_or(false, |ev| {
|
||||||
matches!(ev.kind, EventKind::Whitespace | EventKind::Atom(Softbreak))
|
ev.span
|
||||||
|
.of(self.input.src)
|
||||||
|
.chars()
|
||||||
|
.last()
|
||||||
|
.map_or(false, char::is_whitespace)
|
||||||
});
|
});
|
||||||
if opener.bidirectional() && whitespace_after {
|
if opener.bidirectional() && whitespace_before {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -577,19 +582,29 @@ impl<'s> Parser<'s> {
|
||||||
})
|
})
|
||||||
.or_else(|| {
|
.or_else(|| {
|
||||||
let opener = Opener::from_token(first.kind)?;
|
let opener = Opener::from_token(first.kind)?;
|
||||||
if opener.bidirectional()
|
let whitespace_after = self
|
||||||
&& self
|
|
||||||
.input
|
.input
|
||||||
.peek()
|
.lexer
|
||||||
.map_or(true, |t| matches!(t.kind, lex::Kind::Whitespace))
|
.ahead()
|
||||||
{
|
.chars()
|
||||||
|
.next()
|
||||||
|
.map_or(true, char::is_whitespace);
|
||||||
|
if opener.bidirectional() && whitespace_after {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
let whitespace_before = self.events.back().map_or(false, |ev| {
|
||||||
|
ev.span
|
||||||
|
.of(self.input.src)
|
||||||
|
.chars()
|
||||||
|
.last()
|
||||||
|
.map_or(false, char::is_whitespace)
|
||||||
|
});
|
||||||
if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
|
if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
|
||||||
&& self
|
&& self
|
||||||
.events
|
.events
|
||||||
.back()
|
.back()
|
||||||
.map_or(false, |ev| matches!(ev.kind, EventKind::Str))
|
.map_or(false, |ev| matches!(ev.kind, EventKind::Str))
|
||||||
|
&& !whitespace_before
|
||||||
{
|
{
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
@ -675,10 +690,8 @@ impl<'s> Parser<'s> {
|
||||||
fn merge_str_events(&mut self, span_str: Span) -> Event {
|
fn merge_str_events(&mut self, span_str: Span) -> Event {
|
||||||
let mut span = span_str;
|
let mut span = span_str;
|
||||||
let should_merge = |e: &Event, span: Span| {
|
let should_merge = |e: &Event, span: Span| {
|
||||||
matches!(
|
matches!(e.kind, EventKind::Str | EventKind::Placeholder)
|
||||||
e.kind,
|
&& span.end() == e.span.start()
|
||||||
EventKind::Str | EventKind::Whitespace | EventKind::Placeholder
|
|
||||||
) && span.end() == e.span.start()
|
|
||||||
};
|
};
|
||||||
while self.events.front().map_or(false, |e| should_merge(e, span)) {
|
while self.events.front().map_or(false, |e| should_merge(e, span)) {
|
||||||
let ev = self.events.pop_front().unwrap();
|
let ev = self.events.pop_front().unwrap();
|
||||||
|
@ -882,9 +895,7 @@ impl<'s> Iterator for Parser<'s> {
|
||||||
|| self // for merge or attributes
|
|| self // for merge or attributes
|
||||||
.events
|
.events
|
||||||
.back()
|
.back()
|
||||||
.map_or(false, |ev| {
|
.map_or(false, |ev| matches!(ev.kind, EventKind::Str))
|
||||||
matches!(ev.kind, EventKind::Str | EventKind::Whitespace)
|
|
||||||
})
|
|
||||||
{
|
{
|
||||||
if self.parse_event().is_none() {
|
if self.parse_event().is_none() {
|
||||||
if self.input.complete {
|
if self.input.complete {
|
||||||
|
@ -911,7 +922,7 @@ impl<'s> Iterator for Parser<'s> {
|
||||||
|
|
||||||
self.events.pop_front().and_then(|e| match e.kind {
|
self.events.pop_front().and_then(|e| match e.kind {
|
||||||
EventKind::Str if e.span.is_empty() => self.next(),
|
EventKind::Str if e.span.is_empty() => self.next(),
|
||||||
EventKind::Str | EventKind::Whitespace => Some(self.merge_str_events(e.span)),
|
EventKind::Str => Some(self.merge_str_events(e.span)),
|
||||||
EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(),
|
EventKind::Placeholder | EventKind::Attributes { container: false } => self.next(),
|
||||||
_ => Some(e),
|
_ => Some(e),
|
||||||
})
|
})
|
||||||
|
|
20
src/lex.rs
20
src/lex.rs
|
@ -13,7 +13,6 @@ pub(crate) struct Token {
|
||||||
pub enum Kind {
|
pub enum Kind {
|
||||||
Text,
|
Text,
|
||||||
Newline,
|
Newline,
|
||||||
Whitespace,
|
|
||||||
Nbsp,
|
Nbsp,
|
||||||
Hardbreak,
|
Hardbreak,
|
||||||
Escape,
|
Escape,
|
||||||
|
@ -167,6 +166,8 @@ impl<'s> Lexer<'s> {
|
||||||
_ if escape && first == ' ' => Nbsp,
|
_ if escape && first == ' ' => Nbsp,
|
||||||
_ if escape => Text,
|
_ if escape => Text,
|
||||||
|
|
||||||
|
'\n' => Newline,
|
||||||
|
|
||||||
'\\' => {
|
'\\' => {
|
||||||
if self
|
if self
|
||||||
.peek_char()
|
.peek_char()
|
||||||
|
@ -179,12 +180,6 @@ impl<'s> Lexer<'s> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
'\n' => Newline,
|
|
||||||
_ if first.is_whitespace() => {
|
|
||||||
self.eat_while(char::is_whitespace);
|
|
||||||
Whitespace
|
|
||||||
}
|
|
||||||
|
|
||||||
'[' => Open(Bracket),
|
'[' => Open(Bracket),
|
||||||
']' => Close(Bracket),
|
']' => Close(Bracket),
|
||||||
'(' => Open(Paren),
|
'(' => Open(Paren),
|
||||||
|
@ -323,18 +318,11 @@ mod test {
|
||||||
test_lex!("abc", Text.l(3));
|
test_lex!("abc", Text.l(3));
|
||||||
test_lex!(
|
test_lex!(
|
||||||
"para w/ some _emphasis_ and *strong*.",
|
"para w/ some _emphasis_ and *strong*.",
|
||||||
Text.l(4),
|
Text.l(13),
|
||||||
Whitespace.l(1),
|
|
||||||
Text.l(2),
|
|
||||||
Whitespace.l(1),
|
|
||||||
Text.l(4),
|
|
||||||
Whitespace.l(1),
|
|
||||||
Sym(Underscore).l(1),
|
Sym(Underscore).l(1),
|
||||||
Text.l(8),
|
Text.l(8),
|
||||||
Sym(Underscore).l(1),
|
Sym(Underscore).l(1),
|
||||||
Whitespace.l(1),
|
Text.l(5),
|
||||||
Text.l(3),
|
|
||||||
Whitespace.l(1),
|
|
||||||
Sym(Asterisk).l(1),
|
Sym(Asterisk).l(1),
|
||||||
Text.l(6),
|
Text.l(6),
|
||||||
Sym(Asterisk).l(1),
|
Sym(Asterisk).l(1),
|
||||||
|
|
|
@ -910,9 +910,7 @@ impl<'s> Parser<'s> {
|
||||||
inline::Atom::Escape => Event::Escape,
|
inline::Atom::Escape => Event::Escape,
|
||||||
},
|
},
|
||||||
inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()),
|
inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()),
|
||||||
inline::EventKind::Whitespace
|
inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => {
|
||||||
| inline::EventKind::Attributes { .. }
|
|
||||||
| inline::EventKind::Placeholder => {
|
|
||||||
panic!("{:?}", inline)
|
panic!("{:?}", inline)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue