use crate::lex; use crate::Span; use lex::Delimiter; use lex::Symbol; use Atom::*; use Container::*; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Atom { Softbreak, Hardbreak, Escape, Nbsp, Ellipsis, EnDash, EmDash, } #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum Container { Span, // typesetting Subscript, Superscript, Insert, Delete, Emphasis, Strong, Mark, // smart quoting SingleQuoted, DoubleQuoted, // Verbatim Verbatim, /// Span is the format. RawFormat, InlineMath, DisplayMath, /// Span is the reference link tag. ReferenceLink, /// Delimiter spans are the URL. InlineLink, AutoLink, } #[derive(Debug, PartialEq, Eq)] pub enum EventKind { Enter(Container), Exit(Container), Atom(Atom), Str, Attributes, } #[derive(Debug, PartialEq, Eq)] pub struct Event { pub kind: EventKind, pub span: Span, } pub struct Parser { /// Lexer, hosting upcoming source. lexer: lex::Lexer, /// Span of current event. span: Span, /// Stack with kind and index of _potential_ openers for typesetting containers. typesets: Vec<(Container, usize)>, /// Stack with index of _potential_ span/link openers. spans: Vec, //attributes: Vec<(Span, usize)>, /// Buffer queue for next events. Events are buffered until no modifications due to future /// characters are needed. events: std::collections::VecDeque, } impl + Clone> Parser { pub fn new(chars: I) -> Self { Self { lexer: lex::Lexer::new(chars), span: Span::new(0, 0), typesets: Vec::new(), spans: Vec::new(), events: std::collections::VecDeque::new(), } } fn eat(&mut self) -> Option { let tok = self.lexer.next(); if let Some(t) = &tok { self.span = self.span.extend(t.len); } tok } fn peek(&mut self) -> Option<&lex::Token> { self.lexer.peek() } fn reset_span(&mut self) { self.span = Span::empty_at(self.span.end()); } fn parse_event(&mut self) -> Option { self.reset_span(); self.eat().map(|first| { self.parse_verbatim(&first) .or_else(|| self.parse_span(&first)) .or_else(|| self.parse_typeset(&first)) .or_else(|| self.parse_atom(&first)) .unwrap_or(Event { kind: EventKind::Str, span: self.span, }) }) } fn parse_verbatim(&mut self, first: &lex::Token) -> Option { match first.kind { lex::Kind::Seq(lex::Sequence::Dollar) => { let math_opt = (first.len <= 2) .then(|| { if let Some(lex::Token { kind: lex::Kind::Seq(lex::Sequence::Backtick), len, }) = self.peek() { Some(( if first.len == 2 { DisplayMath } else { InlineMath }, *len, )) } else { None } }) .flatten(); if math_opt.is_some() { self.eat(); // backticks } math_opt } lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, first.len)), _ => None, } .map(|(mut kind, opener_len)| { let opener_event = self.events.len(); self.events.push_back(Event { kind: EventKind::Enter(kind), span: self.span, }); let mut span_inner = Span::empty_at(self.span.end()); let mut span_outer = None; while let Some(t) = self.eat() { if matches!(t.kind, lex::Kind::Seq(lex::Sequence::Backtick)) && t.len == opener_len { if matches!(kind, Verbatim) && matches!( self.lexer.peek().map(|t| &t.kind), Some(lex::Kind::Open(Delimiter::BraceEqual)) ) { let mut ahead = self.lexer.inner().clone(); let mut end = false; let len = (&mut ahead) .take_while(|c| { if *c == '{' { return false; } if *c == '}' { end = true; }; !end && !c.is_whitespace() }) .count(); if len > 0 && end { self.lexer = lex::Lexer::new(ahead); let span_format = Span::by_len(self.span.end() + "{=".len(), len); kind = RawFormat; self.events[opener_event].kind = EventKind::Enter(kind); self.events[opener_event].span = span_format; self.span = span_format.translate(1); // } span_outer = Some(span_format); } } break; } span_inner = span_inner.extend(t.len); self.reset_span(); } self.events.push_back(Event { kind: EventKind::Str, span: span_inner, }); Event { kind: EventKind::Exit(kind), span: span_outer.unwrap_or(self.span), } }) } fn parse_span(&mut self, first: &lex::Token) -> Option { match first.kind { lex::Kind::Open(Delimiter::Bracket) => Some(true), lex::Kind::Close(Delimiter::Bracket) => Some(false), _ => None, } .and_then(|open| { if open { self.spans.push(self.events.len()); // use str for now, replace if closed later Some(Event { kind: EventKind::Str, span: self.span, }) } else if !self.spans.is_empty() { let mut ahead = self.lexer.inner().clone(); match ahead.next() { Some(opener @ ('[' | '(')) => { let (closer, kind) = match opener { '[' => (']', ReferenceLink), '(' => (')', InlineLink), _ => unreachable!(), }; let mut end = false; let len = (&mut ahead) .take_while(|c| { if *c == closer { end = true; }; !end && *c != opener }) .count(); end.then(|| { let span = Span::by_len(self.span.end() + 1, len); (kind, span) }) } Some('{') => todo!(), _ => None, } .map(|(kind, span)| { self.lexer = lex::Lexer::new(ahead); let opener_event = self.spans.pop().unwrap(); self.events[opener_event].kind = EventKind::Enter(kind); self.events[opener_event].span = span; self.span = span.translate(1); Event { kind: EventKind::Exit(kind), span, } }) } else { None } }) } fn parse_typeset(&mut self, first: &lex::Token) -> Option { enum Dir { Open, Close, Both, } match first.kind { lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)), lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)), lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)), lex::Kind::Sym(Symbol::Tilde) => Some((Subscript, Dir::Both)), lex::Kind::Sym(Symbol::Quote1) => Some((SingleQuoted, Dir::Both)), lex::Kind::Sym(Symbol::Quote2) => Some((DoubleQuoted, Dir::Both)), lex::Kind::Open(Delimiter::BraceAsterisk) => Some((Strong, Dir::Open)), lex::Kind::Close(Delimiter::BraceAsterisk) => Some((Strong, Dir::Close)), lex::Kind::Open(Delimiter::BraceCaret) => Some((Superscript, Dir::Open)), lex::Kind::Close(Delimiter::BraceCaret) => Some((Superscript, Dir::Close)), lex::Kind::Open(Delimiter::BraceEqual) => Some((Mark, Dir::Open)), lex::Kind::Close(Delimiter::BraceEqual) => Some((Mark, Dir::Close)), lex::Kind::Open(Delimiter::BraceHyphen) => Some((Delete, Dir::Open)), lex::Kind::Close(Delimiter::BraceHyphen) => Some((Delete, Dir::Close)), lex::Kind::Open(Delimiter::BracePlus) => Some((Insert, Dir::Open)), lex::Kind::Close(Delimiter::BracePlus) => Some((Insert, Dir::Close)), lex::Kind::Open(Delimiter::BraceTilde) => Some((Subscript, Dir::Open)), lex::Kind::Close(Delimiter::BraceTilde) => Some((Subscript, Dir::Close)), lex::Kind::Open(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Open)), lex::Kind::Close(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Close)), _ => None, } .map(|(cont, dir)| { self.typesets .iter() .rposition(|(c, _)| *c == cont) .and_then(|o| { matches!(dir, Dir::Close | Dir::Both).then(|| { let (_, e) = &mut self.typesets[o]; self.events[*e].kind = EventKind::Enter(cont); self.typesets.drain(o..); EventKind::Exit(cont) }) }) .unwrap_or_else(|| { self.typesets.push((cont, self.events.len())); // use str for now, replace if closed later EventKind::Str }) }) .map(|kind| Event { kind, span: self.span, }) } fn parse_atom(&mut self, first: &lex::Token) -> Option { let atom = match first.kind { lex::Kind::Newline => Softbreak, lex::Kind::Hardbreak => Hardbreak, lex::Kind::Escape => Escape, lex::Kind::Nbsp => Nbsp, lex::Kind::Seq(lex::Sequence::Period) if first.len == 3 => Ellipsis, lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 2 => EnDash, lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 3 => EmDash, _ => return None, }; Some(Event { kind: EventKind::Atom(atom), span: self.span, }) } } impl + Clone> Iterator for Parser { type Item = Event; fn next(&mut self) -> Option { while self.events.is_empty() || !self.typesets.is_empty() || !self.spans.is_empty() || self // for merge .events .back() .map_or(false, |ev| matches!(ev.kind, EventKind::Str)) { if let Some(ev) = self.parse_event() { self.events.push_back(ev); } else { break; } } self.events.pop_front().map(|e| { if matches!(e.kind, EventKind::Str) { // merge str events let mut span = e.span; while self .events .front() .map_or(false, |ev| matches!(ev.kind, EventKind::Str)) { let ev = self.events.pop_front().unwrap(); assert_eq!(span.end(), ev.span.start()); span = span.union(ev.span); } Event { kind: EventKind::Str, span, } } else { e } }) } } #[cfg(test)] mod test { use crate::Span; use super::Atom::*; use super::Container::*; use super::EventKind::*; use super::Verbatim; macro_rules! test_parse { ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { #[allow(unused)] let mut p = super::Parser::new($src.chars()); let actual = p.map(|ev| (ev.kind, ev.span.of($src))).collect::>(); let expected = &[$($($token),*,)?]; assert_eq!(actual, expected, "\n\n{}\n\n", $src); }; } #[test] fn str() { test_parse!("abc", (Str, "abc")); test_parse!("abc def", (Str, "abc def")); } #[test] fn verbatim() { test_parse!( "`abc`", (Enter(Verbatim), "`"), (Str, "abc"), (Exit(Verbatim), "`"), ); test_parse!( "`abc\ndef`", (Enter(Verbatim), "`"), (Str, "abc\ndef"), (Exit(Verbatim), "`"), ); test_parse!( "`abc&def`", (Enter(Verbatim), "`"), (Str, "abc&def"), (Exit(Verbatim), "`"), ); test_parse!( "`abc", (Enter(Verbatim), "`"), (Str, "abc"), (Exit(Verbatim), ""), ); test_parse!( "``abc``", (Enter(Verbatim), "``"), (Str, "abc"), (Exit(Verbatim), "``"), ); test_parse!( "abc `def`", (Str, "abc "), (Enter(Verbatim), "`"), (Str, "def"), (Exit(Verbatim), "`"), ); test_parse!( "abc`def`", (Str, "abc"), (Enter(Verbatim), "`"), (Str, "def"), (Exit(Verbatim), "`"), ); } #[test] fn math() { test_parse!( "$`abc`", (Enter(InlineMath), "$`"), (Str, "abc"), (Exit(InlineMath), "`"), ); test_parse!( "$`abc` str", (Enter(InlineMath), "$`"), (Str, "abc"), (Exit(InlineMath), "`"), (Str, " str"), ); test_parse!( "$$`abc`", (Enter(DisplayMath), "$$`"), (Str, "abc"), (Exit(DisplayMath), "`"), ); test_parse!( "$`abc", (Enter(InlineMath), "$`"), (Str, "abc"), (Exit(InlineMath), ""), ); test_parse!( "$```abc```", (Enter(InlineMath), "$```"), (Str, "abc"), (Exit(InlineMath), "```"), ); } #[test] fn raw_format() { test_parse!( "`raw`{=format}", (Enter(RawFormat), "format"), (Str, "raw"), (Exit(RawFormat), "format"), ); test_parse!( "before `raw`{=format} after", (Str, "before "), (Enter(RawFormat), "format"), (Str, "raw"), (Exit(RawFormat), "format"), (Str, " after"), ); } #[test] fn raw_attr() { test_parse!( "`raw`{=format #id}", (Enter(Verbatim), "`"), (Str, "raw"), (Exit(Verbatim), "`"), (Str, "{=format #id}"), ); } #[test] fn span_tag() { test_parse!( "[text][tag]", (Enter(ReferenceLink), "tag"), (Str, "text"), (Exit(ReferenceLink), "tag"), ); test_parse!( "before [text][tag] after", (Str, "before "), (Enter(ReferenceLink), "tag"), (Str, "text"), (Exit(ReferenceLink), "tag"), (Str, " after"), ); test_parse!( "[[inner][i]][o]", (Enter(ReferenceLink), "o"), (Enter(ReferenceLink), "i"), (Str, "inner"), (Exit(ReferenceLink), "i"), (Exit(ReferenceLink), "o"), ); } #[test] fn span_url() { test_parse!( "before [text](url) after", (Str, "before "), (Enter(InlineLink), "url"), (Str, "text"), (Exit(InlineLink), "url"), (Str, " after"), ); test_parse!( "[outer [inner](i)](o)", (Enter(InlineLink), "o"), (Str, "outer "), (Enter(InlineLink), "i"), (Str, "inner"), (Exit(InlineLink), "i"), (Exit(InlineLink), "o"), ); } #[test] fn typeset_basic() { test_parse!( "_abc_", (Enter(Emphasis), "_"), (Str, "abc"), (Exit(Emphasis), "_"), ); test_parse!( "{_abc_}", (Enter(Emphasis), "{_"), (Str, "abc"), (Exit(Emphasis), "_}"), ); } #[test] fn typeset_nest() { test_parse!( "{_{_abc_}_}", (Enter(Emphasis), "{_"), (Enter(Emphasis), "{_"), (Str, "abc"), (Exit(Emphasis), "_}"), (Exit(Emphasis), "_}"), ); test_parse!( "*_abc_*", (Enter(Strong), "*"), (Enter(Emphasis), "_"), (Str, "abc"), (Exit(Emphasis), "_"), (Exit(Strong), "*"), ); } #[test] fn typeset_unopened() { test_parse!("*}abc", (Str, "*}abc")); } #[test] fn typeset_close_parent() { test_parse!( "{*{_abc*}", (Enter(Strong), "{*"), (Str, "{_abc"), (Exit(Strong), "*}"), ); } #[test] fn typeset_close_block() { test_parse!("{_abc", (Str, "{_abc")); test_parse!("{_{*{_abc", (Str, "{_{*{_abc")); } }