jotdown/src/inline.rs

549 lines
16 KiB
Rust
Raw Normal View History

2022-11-21 13:44:59 -05:00
use crate::lex;
2022-11-21 16:40:11 -05:00
use crate::Span;
2022-11-16 16:11:55 -05:00
2022-11-21 13:44:59 -05:00
use lex::Delimiter;
use lex::Symbol;
2022-11-16 16:11:55 -05:00
use Atom::*;
use Container::*;
/// A leaf inline element, i.e. an element without children.
///
/// The source text an atom was parsed from is given by the span of the [`Event`] carrying it.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Atom {
    /// Emitted for a newline token.
    Softbreak,
    /// Emitted for a hard break token.
    Hardbreak,
    /// Emitted for an escape token.
    Escape,
    /// Emitted for a non-breaking space token.
    Nbsp,
    /// Emitted for a sequence of exactly three periods.
    Ellipsis,
    /// Emitted for a sequence of exactly two hyphens.
    EnDash,
    /// Emitted for a sequence of exactly three hyphens.
    EmDash,
}
2022-11-20 13:13:48 -05:00
/// An inline element that is entered and exited, with content in between.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Container {
    /// Delimited by bracket tokens.
    Span,
    // typesetting
    /// Delimited by tilde symbols or brace-tilde delimiters.
    Subscript,
    /// Delimited by caret symbols or brace-caret delimiters.
    Superscript,
    /// Delimited by brace-plus delimiters.
    Insert,
    /// Delimited by brace-hyphen delimiters.
    Delete,
    /// Delimited by underscore symbols or brace-underscore delimiters, e.g. `_abc_` or `{_abc_}`.
    Emphasis,
    /// Delimited by asterisk symbols or brace-asterisk delimiters, e.g. `{*abc*}`.
    Strong,
    /// Delimited by brace-equal delimiters.
    Mark,
    // smart quoting
    /// Delimited by single quote symbols.
    SingleQuoted,
    /// Delimited by double quote symbols.
    DoubleQuoted,
    // Verbatim
    /// Delimited by backtick sequences of equal length.
    Verbatim,
    /// A verbatim element whose closing backticks are directly followed by `{=format}`.
    RawFormat,
    /// A verbatim element opened by a single `$` directly followed by backticks.
    InlineMath,
    /// A verbatim element opened by `$$` directly followed by backticks.
    DisplayMath,
    // Links
    // NOTE(review): the link variants are not produced by the parser code in this file yet.
    ReferenceLink,
    InlineLink,
    AutoLink,
}
2022-11-21 16:40:11 -05:00
#[derive(Debug, PartialEq, Eq)]
2022-11-27 18:10:28 -05:00
pub enum EventKind {
2022-11-28 13:08:49 -05:00
Enter(Container),
2022-11-22 13:19:21 -05:00
Exit(Container),
2022-11-21 13:44:59 -05:00
Atom(Atom),
2022-12-08 11:42:54 -05:00
Str,
2022-12-11 03:26:55 -05:00
Attributes,
2022-11-20 13:13:48 -05:00
}
2022-11-27 18:10:28 -05:00
/// A single parsing result: what was found, and where in the source it was found.
#[derive(Debug, PartialEq, Eq)]
pub struct Event {
    /// What this event represents.
    pub kind: EventKind,
    /// Range of the source text that this event corresponds to.
    pub span: Span,
}
2022-12-11 03:26:55 -05:00
/// Current parsing state of elements that are not recursive, i.e. may not contain arbitrary inline
/// elements, can only be one of these at a time.
2022-12-11 04:45:05 -05:00
#[derive(Debug)]
2022-12-11 03:26:55 -05:00
enum State {
None,
/// Within a verbatim element, e.g. '$`xxxxx'
Verbatim {
kind: Container,
opener_len: usize,
2022-12-11 04:45:05 -05:00
opener_event: usize,
2022-12-11 03:26:55 -05:00
},
/// Potentially within an attribute list, e.g. '{a=b '.
Attributes {
comment: bool,
},
/// Potentially within an autolink URL or an inline link URL, e.g. '<https://' or
/// '[text](https://'.
Url {
auto: bool,
},
/// Potentially within a reference link tag, e.g. '[text][tag '
ReferenceLinkTag,
}
impl State {
2022-12-11 04:45:05 -05:00
fn verbatim(&self) -> Option<(Container, usize, usize)> {
if let Self::Verbatim {
kind,
opener_len,
opener_event,
} = self
{
Some((*kind, *opener_len, *opener_event))
2022-12-11 03:26:55 -05:00
} else {
None
}
}
2022-11-20 13:13:48 -05:00
}
2022-11-16 16:11:55 -05:00
2022-11-22 13:19:21 -05:00
pub struct Parser<'s> {
2022-11-27 17:56:19 -05:00
openers: Vec<(Container, usize)>,
events: std::collections::VecDeque<Event>,
2022-11-26 19:12:56 -05:00
span: Span,
2022-12-11 03:25:35 -05:00
lexer: lex::Lexer<'s>,
2022-12-08 11:42:54 -05:00
2022-12-11 03:26:55 -05:00
state: State,
2022-12-08 11:42:54 -05:00
last: bool,
2022-11-16 16:11:55 -05:00
}
2022-11-22 13:19:21 -05:00
impl<'s> Parser<'s> {
2022-11-21 13:56:11 -05:00
pub fn new() -> Self {
2022-11-16 16:11:55 -05:00
Self {
openers: Vec::new(),
2022-11-27 17:56:19 -05:00
events: std::collections::VecDeque::new(),
2022-11-26 19:12:56 -05:00
span: Span::new(0, 0),
2022-12-11 03:25:35 -05:00
lexer: lex::Lexer::new(""),
2022-12-08 11:42:54 -05:00
2022-12-11 03:26:55 -05:00
state: State::None,
2022-12-08 11:42:54 -05:00
last: false,
2022-11-20 13:13:48 -05:00
}
}
2022-12-08 11:42:54 -05:00
pub fn parse(&mut self, src: &'s str, last: bool) {
2022-12-11 03:25:35 -05:00
self.lexer = lex::Lexer::new(src);
2022-12-08 11:42:54 -05:00
if last {
assert!(!self.last);
}
self.last = last;
2022-11-22 13:19:21 -05:00
}
2022-11-26 19:12:56 -05:00
/// Consumes the next token from the lexer and extends the current span by the token's
/// length. Returns `None`, leaving the span untouched, when the lexer is exhausted.
fn eat(&mut self) -> Option<lex::Token> {
    let tok = self.lexer.next()?;
    self.span = self.span.extend(tok.len);
    Some(tok)
}
2022-11-26 19:12:56 -05:00
/// Looks at the upcoming token by delegating to the lexer's `peek`; the current span is not
/// modified.
fn peek(&mut self) -> Option<&lex::Token> {
    let lexer = &mut self.lexer;
    lexer.peek()
}
2022-11-21 13:56:11 -05:00
2022-11-26 19:12:56 -05:00
/// Starts a new, empty span at the position where the current span ends.
fn reset_span(&mut self) {
    let end = self.span.end();
    self.span = Span::empty_at(end);
}
2022-11-26 19:12:56 -05:00
fn parse_event(&mut self) -> Option<Event> {
self.reset_span();
self.eat().map(|first| {
self.parse_verbatim(&first)
.or_else(|| self.parse_container(&first))
.or_else(|| self.parse_atom(&first))
2022-12-08 11:42:54 -05:00
.unwrap_or(Event {
kind: EventKind::Str,
span: self.span,
})
2022-11-26 19:12:56 -05:00
})
}
2022-11-21 16:40:11 -05:00
2022-11-26 19:12:56 -05:00
fn parse_atom(&mut self, first: &lex::Token) -> Option<Event> {
2022-11-27 18:10:28 -05:00
let atom = match first.kind {
2022-12-08 12:25:24 -05:00
lex::Kind::Newline => Softbreak,
lex::Kind::Hardbreak => Hardbreak,
2022-11-27 18:10:28 -05:00
lex::Kind::Escape => Escape,
lex::Kind::Nbsp => Nbsp,
2022-12-01 12:09:09 -05:00
lex::Kind::Seq(lex::Sequence::Period) if first.len == 3 => Ellipsis,
lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 2 => EnDash,
lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 3 => EmDash,
2022-11-27 18:10:28 -05:00
_ => return None,
};
Some(Event {
kind: EventKind::Atom(atom),
span: self.span,
})
2022-11-26 19:12:56 -05:00
}
2022-11-20 13:13:48 -05:00
2022-11-26 19:12:56 -05:00
fn parse_verbatim(&mut self, first: &lex::Token) -> Option<Event> {
2022-12-11 03:26:55 -05:00
self.state
.verbatim()
2022-12-11 04:45:05 -05:00
.map(|(kind, opener_len, opener_event)| {
dbg!(&self.events, opener_event);
assert_eq!(self.events[opener_event].kind, EventKind::Enter(kind));
2022-12-08 11:42:54 -05:00
let kind = if matches!(first.kind, lex::Kind::Seq(lex::Sequence::Backtick))
&& first.len == opener_len
{
2022-12-11 03:26:55 -05:00
self.state = State::None;
2022-12-11 04:45:05 -05:00
let kind =
if matches!(kind, Verbatim) && self.lexer.peek_ahead().starts_with("{=") {
let mut chars = self.lexer.peek_ahead()[2..].chars();
let len = chars
.clone()
.take_while(|c| !c.is_whitespace() && !matches!(c, '{' | '}'))
.count();
if len > 0 && chars.nth(len) == Some('}') {
self.lexer = lex::Lexer::new(chars.as_str());
let span_format = Span::by_len(self.span.end() + "{=".len(), len);
self.events[opener_event].kind = EventKind::Enter(RawFormat);
self.events[opener_event].span = span_format;
self.span = span_format;
RawFormat
} else {
Verbatim
}
} else {
kind
};
EventKind::Exit(kind)
2022-12-08 11:42:54 -05:00
} else {
EventKind::Str
};
Event {
kind,
span: self.span,
}
})
.or_else(|| {
match first.kind {
lex::Kind::Seq(lex::Sequence::Dollar) => {
let math_opt = (first.len <= 2)
.then(|| {
if let Some(lex::Token {
kind: lex::Kind::Seq(lex::Sequence::Backtick),
len,
}) = self.peek()
{
Some((
if first.len == 2 {
2022-12-11 04:45:05 -05:00
DisplayMath
2022-12-08 11:42:54 -05:00
} else {
2022-12-11 04:45:05 -05:00
InlineMath
2022-12-08 11:42:54 -05:00
},
*len,
))
2022-11-27 15:59:54 -05:00
} else {
2022-12-08 11:42:54 -05:00
None
}
})
.flatten();
if math_opt.is_some() {
self.eat(); // backticks
2022-11-21 13:44:59 -05:00
}
2022-12-08 11:42:54 -05:00
math_opt
}
2022-12-11 04:45:05 -05:00
lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, first.len)),
2022-12-08 11:42:54 -05:00
_ => None,
2022-11-21 13:44:59 -05:00
}
2022-12-08 11:42:54 -05:00
.map(|(kind, opener_len)| {
2022-12-11 04:45:05 -05:00
dbg!(&self.events);
self.state = State::Verbatim {
kind,
opener_len,
opener_event: self.events.len(),
};
2022-12-08 11:42:54 -05:00
Event {
kind: EventKind::Enter(kind),
span: self.span,
}
})
})
2022-11-26 19:12:56 -05:00
}
2022-11-21 13:44:59 -05:00
2022-11-26 19:12:56 -05:00
fn parse_container(&mut self, first: &lex::Token) -> Option<Event> {
2022-12-11 03:26:55 -05:00
enum Dir {
Open,
Close,
Both,
}
2022-11-26 19:12:56 -05:00
match first.kind {
lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)),
lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)),
lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)),
lex::Kind::Sym(Symbol::Tilde) => Some((Subscript, Dir::Both)),
lex::Kind::Sym(Symbol::Quote1) => Some((SingleQuoted, Dir::Both)),
lex::Kind::Sym(Symbol::Quote2) => Some((DoubleQuoted, Dir::Both)),
2022-11-27 15:59:54 -05:00
lex::Kind::Open(Delimiter::Bracket) => Some((Span, Dir::Open)),
lex::Kind::Close(Delimiter::Bracket) => Some((Span, Dir::Close)),
2022-11-26 19:12:56 -05:00
lex::Kind::Open(Delimiter::BraceAsterisk) => Some((Strong, Dir::Open)),
lex::Kind::Close(Delimiter::BraceAsterisk) => Some((Strong, Dir::Close)),
lex::Kind::Open(Delimiter::BraceCaret) => Some((Superscript, Dir::Open)),
lex::Kind::Close(Delimiter::BraceCaret) => Some((Superscript, Dir::Close)),
2022-11-27 17:56:19 -05:00
lex::Kind::Open(Delimiter::BraceEqual) => Some((Mark, Dir::Open)),
lex::Kind::Close(Delimiter::BraceEqual) => Some((Mark, Dir::Close)),
2022-11-26 19:12:56 -05:00
lex::Kind::Open(Delimiter::BraceHyphen) => Some((Delete, Dir::Open)),
lex::Kind::Close(Delimiter::BraceHyphen) => Some((Delete, Dir::Close)),
lex::Kind::Open(Delimiter::BracePlus) => Some((Insert, Dir::Open)),
lex::Kind::Close(Delimiter::BracePlus) => Some((Insert, Dir::Close)),
lex::Kind::Open(Delimiter::BraceTilde) => Some((Subscript, Dir::Open)),
lex::Kind::Close(Delimiter::BraceTilde) => Some((Subscript, Dir::Close)),
lex::Kind::Open(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Open)),
lex::Kind::Close(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Close)),
_ => None,
2022-11-20 13:13:48 -05:00
}
2022-11-28 13:08:49 -05:00
.map(|(cont, dir)| {
2022-11-27 17:56:19 -05:00
self.openers
.iter()
2022-11-28 13:08:49 -05:00
.rposition(|(c, _)| *c == cont)
2022-11-27 17:56:19 -05:00
.and_then(|o| {
matches!(dir, Dir::Close | Dir::Both).then(|| {
2022-11-27 17:59:01 -05:00
let (_, e) = &mut self.openers[o];
2022-11-28 13:08:49 -05:00
self.events[*e].kind = EventKind::Enter(cont);
self.openers.drain(o..);
EventKind::Exit(cont)
2022-11-27 17:56:19 -05:00
})
})
.unwrap_or_else(|| {
2022-11-28 13:08:49 -05:00
self.openers.push((cont, self.events.len()));
// use str for now, replace if closed later
2022-12-08 11:42:54 -05:00
EventKind::Str
2022-11-27 17:56:19 -05:00
})
2022-11-26 19:12:56 -05:00
})
2022-11-27 18:10:28 -05:00
.map(|kind| Event {
kind,
span: self.span,
})
2022-11-26 19:12:56 -05:00
}
}
impl<'s> Iterator for Parser<'s> {
type Item = Event;
fn next(&mut self) -> Option<Self::Item> {
2022-12-11 05:12:17 -05:00
let mut ready = true;
2022-11-27 18:34:30 -05:00
while self.events.is_empty()
|| !self.openers.is_empty()
2022-12-11 04:45:05 -05:00
|| !matches!(self.state, State::None)
|| self // for merge
2022-11-28 13:08:49 -05:00
.events
.back()
2022-12-08 11:42:54 -05:00
.map_or(false, |ev| matches!(ev.kind, EventKind::Str))
2022-11-27 18:34:30 -05:00
{
2022-11-27 17:56:19 -05:00
if let Some(ev) = self.parse_event() {
self.events.push_back(ev);
2022-12-11 04:45:05 -05:00
dbg!(&self.events, &self.state);
2022-11-27 17:56:19 -05:00
} else {
2022-12-11 05:12:17 -05:00
ready = false;
2022-11-27 17:56:19 -05:00
break;
}
}
2022-11-26 19:12:56 -05:00
2022-12-11 05:12:17 -05:00
if self.last || ready {
2022-12-11 04:45:05 -05:00
self.events
.pop_front()
.map(|e| {
if matches!(e.kind, EventKind::Str) {
// merge str events
let mut span = e.span;
while self
.events
.front()
.map_or(false, |ev| matches!(ev.kind, EventKind::Str))
{
let ev = self.events.pop_front().unwrap();
assert_eq!(span.end(), ev.span.start());
span = span.union(ev.span);
}
Event {
kind: EventKind::Str,
span,
}
} else {
e
2022-12-08 11:42:54 -05:00
}
2022-12-11 04:45:05 -05:00
})
.or_else(|| {
self.state.verbatim().map(|(kind, _, _)| {
2022-12-11 03:26:55 -05:00
self.state = State::None;
Event {
kind: EventKind::Exit(kind),
span: self.span,
}
2022-12-08 11:42:54 -05:00
})
2022-12-11 04:45:05 -05:00
})
} else {
None
}
2022-11-27 18:34:30 -05:00
}
}
2022-11-21 16:40:11 -05:00
#[cfg(test)]
mod test {
    use crate::Span;

    use super::Atom::*;
    use super::Container::*;
    use super::EventKind::*;
    use super::Verbatim;

    /// Parses `$src` as the last input and asserts that the resulting events, as pairs of event
    /// kind and the source text covered by the event span, equal the listed pairs.
    macro_rules! test_parse {
        ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
            #[allow(unused)]
            let mut p = super::Parser::new();
            p.parse($src, true);
            let actual = p.map(|ev| (ev.kind, ev.span.of($src))).collect::<Vec<_>>();
            let expected = &[$($($token),*,)?];
            assert_eq!(actual, expected, "\n\n{}\n\n", $src);
        };
    }

    #[test]
    fn str() {
        test_parse!("abc", (Str, "abc"));
        test_parse!("abc def", (Str, "abc def"));
    }

    #[test]
    fn verbatim() {
        test_parse!(
            "`abc`",
            (Enter(Verbatim), "`"),
            (Str, "abc"),
            (Exit(Verbatim), "`"),
        );
        test_parse!(
            "`abc\ndef`",
            (Enter(Verbatim), "`"),
            (Str, "abc\ndef"),
            (Exit(Verbatim), "`"),
        );
        test_parse!(
            "`abc&def`",
            (Enter(Verbatim), "`"),
            (Str, "abc&def"),
            (Exit(Verbatim), "`"),
        );
        // unclosed verbatim is closed implicitly with an empty span
        test_parse!(
            "`abc",
            (Enter(Verbatim), "`"),
            (Str, "abc"),
            (Exit(Verbatim), ""),
        );
        test_parse!(
            "``abc``",
            (Enter(Verbatim), "``"),
            (Str, "abc"),
            (Exit(Verbatim), "``"),
        );
        test_parse!(
            "abc `def`",
            (Str, "abc "),
            (Enter(Verbatim), "`"),
            (Str, "def"),
            (Exit(Verbatim), "`"),
        );
        test_parse!(
            "abc`def`",
            (Str, "abc"),
            (Enter(Verbatim), "`"),
            (Str, "def"),
            (Exit(Verbatim), "`"),
        );
    }

    #[test]
    fn math() {
        test_parse!(
            "$`abc`",
            (Enter(InlineMath), "$`"),
            (Str, "abc"),
            (Exit(InlineMath), "`"),
        );
        test_parse!(
            "$`abc` str",
            (Enter(InlineMath), "$`"),
            (Str, "abc"),
            (Exit(InlineMath), "`"),
            (Str, " str"),
        );
        test_parse!(
            "$$`abc`",
            (Enter(DisplayMath), "$$`"),
            (Str, "abc"),
            (Exit(DisplayMath), "`"),
        );
        test_parse!(
            "$`abc",
            (Enter(InlineMath), "$`"),
            (Str, "abc"),
            (Exit(InlineMath), ""),
        );
        test_parse!(
            "$```abc```",
            (Enter(InlineMath), "$```"),
            (Str, "abc"),
            (Exit(InlineMath), "```"),
        );
    }

    #[test]
    fn container_basic() {
        test_parse!(
            "_abc_",
            (Enter(Emphasis), "_"),
            (Str, "abc"),
            (Exit(Emphasis), "_"),
        );
        test_parse!(
            "{_abc_}",
            (Enter(Emphasis), "{_"),
            (Str, "abc"),
            (Exit(Emphasis), "_}"),
        );
    }

    #[test]
    fn container_nest() {
        test_parse!(
            "{_{_abc_}_}",
            (Enter(Emphasis), "{_"),
            (Enter(Emphasis), "{_"),
            (Str, "abc"),
            (Exit(Emphasis), "_}"),
            (Exit(Emphasis), "_}"),
        );
        test_parse!(
            "*_abc_*",
            (Enter(Strong), "*"),
            (Enter(Emphasis), "_"),
            (Str, "abc"),
            (Exit(Emphasis), "_"),
            (Exit(Strong), "*"),
        );
    }

    #[test]
    fn container_unopened() {
        test_parse!("*}abc", (Str, "*}abc"));
    }

    #[test]
    fn container_close_parent() {
        test_parse!(
            "{*{_abc*}",
            (Enter(Strong), "{*"),
            (Str, "{_abc"),
            (Exit(Strong), "*}"),
        );
    }

    #[test]
    fn container_close_block() {
        test_parse!("{_abc", (Str, "{_abc"));
        test_parse!("{_{*{_abc", (Str, "{_{*{_abc"));
    }
}