diff --git a/src/inline.rs b/src/inline.rs index c51b9fa..4bc2ca5 100644 --- a/src/inline.rs +++ b/src/inline.rs @@ -6,10 +6,10 @@ use lex::Symbol; use Atom::*; use Container::*; +use NodeKind::*; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Atom { - Str, Softbreak, Hardbreak, Escape, @@ -19,15 +19,31 @@ pub enum Atom { ImageMarker, // ?? EmDash, EnDash, + Lt, + Gt, + Ampersand, + Quote, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Node { + pub kind: NodeKind, + pub span: Span, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum NodeKind { + Str, + // link FootnoteReference, - Link, ReferenceLink, + Link, Emoji, + // verbatim + Verbatim, RawFormat, - // math DisplayMath, InlineMath, - Verbatim, } #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -60,6 +76,7 @@ pub enum Event { Enter(Container), Exit(Container), Atom(Atom), + Node(Node), } #[derive(Debug, Clone, Copy)] @@ -71,21 +88,148 @@ pub enum Dir { pub struct Parser<'s> { openers: Vec, - events: Vec, - lexer: Option>>, + close_containers: Option, + next: Option, + span: Span, + + lexer: std::iter::Peekable>, } impl<'s> Parser<'s> { pub fn new() -> Self { Self { openers: Vec::new(), - events: Vec::new(), - lexer: None, + close_containers: None, + next: None, + span: Span::new(0, 0), + + lexer: lex::Lexer::new("").peekable(), } } pub fn parse(&mut self, src: &'s str) { - self.lexer = Some(lex::Lexer::new(src).peekable()); + self.lexer = lex::Lexer::new(src).peekable(); + } + + fn eat(&mut self) -> Option { + let tok = self.lexer.next(); + if let Some(t) = &tok { + self.span = self.span.extend(t.len); + } + tok + } + + fn peek(&mut self) -> Option<&lex::Token> { + self.lexer.peek() + } + + fn reset_span(&mut self) { + self.span = Span::empty_at(self.span.end()); + } + + fn node(&self, kind: NodeKind) -> Event { + Event::Node(Node { + span: self.span, + kind, + }) + } + + fn parse_event(&mut self) -> Option { + self.reset_span(); + self.eat().map(|first| { + self.parse_verbatim(&first) + .or_else(|| self.parse_container(&first)) + .or_else(|| self.parse_atom(&first)) + .unwrap_or_else(|| self.node(Str)) + }) + } + + fn parse_atom(&mut self, first: &lex::Token) -> Option { + match first.kind { + lex::Kind::Escape => Some(Event::Atom(Escape)), + lex::Kind::Nbsp => Some(Event::Atom(Nbsp)), + lex::Kind::Sym(lex::Symbol::Lt) => Some(Event::Atom(Lt)), + lex::Kind::Sym(lex::Symbol::Gt) => Some(Event::Atom(Gt)), + lex::Kind::Sym(lex::Symbol::Quote2) => Some(Event::Atom(Quote)), + _ => None, + } + } + + fn parse_verbatim(&mut self, first: &lex::Token) -> Option { + match first.kind { + lex::Kind::Seq(lex::Sequence::Dollar) => { + let math_opt = (first.len <= 2) + .then(|| { + if let Some(lex::Token { + kind: lex::Kind::Seq(lex::Sequence::Backtick), + len, + }) = self.peek() + { + Some((DisplayMath, first.len)) + } else { + None + } + }) + .flatten(); + if math_opt.is_some() { + self.eat(); // backticks + } + math_opt + } + lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, first.len)), + _ => None, + } + .map(|(kind, opener_len)| { + let mut span = Span::empty_at(self.span.end()); + while let Some(tok) = self.eat() { + if matches!(tok.kind, lex::Kind::Seq(lex::Sequence::Backtick)) + && tok.len == opener_len + { + break; + } + span = span.extend(tok.len); + } + Event::Node(Node { kind, span }) + }) + } + + fn parse_container(&mut self, first: &lex::Token) -> Option { + match first.kind { + lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)), + lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)), + lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)), + lex::Kind::Sym(Symbol::Tilde) => Some((Subscript, Dir::Both)), + lex::Kind::Sym(Symbol::Quote1) => Some((SingleQuoted, Dir::Both)), + lex::Kind::Sym(Symbol::Quote2) => Some((DoubleQuoted, Dir::Both)), + lex::Kind::Open(Delimiter::Bracket) => Some((LinkText, Dir::Open)), + lex::Kind::Close(Delimiter::Bracket) => Some((LinkText, Dir::Close)), + lex::Kind::Open(Delimiter::BraceAsterisk) => Some((Strong, Dir::Open)), + lex::Kind::Close(Delimiter::BraceAsterisk) => Some((Strong, Dir::Close)), + lex::Kind::Open(Delimiter::BraceCaret) => Some((Superscript, Dir::Open)), + lex::Kind::Close(Delimiter::BraceCaret) => Some((Superscript, Dir::Close)), + lex::Kind::Open(Delimiter::BraceEqual) => Some((Mark, Dir::Open)), + lex::Kind::Close(Delimiter::BraceEqual) => Some((Mark, Dir::Close)), + lex::Kind::Open(Delimiter::BraceHyphen) => Some((Delete, Dir::Open)), + lex::Kind::Close(Delimiter::BraceHyphen) => Some((Delete, Dir::Close)), + lex::Kind::Open(Delimiter::BracePlus) => Some((Insert, Dir::Open)), + lex::Kind::Close(Delimiter::BracePlus) => Some((Insert, Dir::Close)), + lex::Kind::Open(Delimiter::BraceTilde) => Some((Subscript, Dir::Open)), + lex::Kind::Close(Delimiter::BraceTilde) => Some((Subscript, Dir::Close)), + lex::Kind::Open(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Open)), + lex::Kind::Close(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Close)), + _ => None, + } + .and_then(|(cont, dir)| { + if matches!(dir, Dir::Close | Dir::Both) && self.openers.contains(&cont) { + self.close_containers = self.openers.iter().rposition(|o| *o == cont); + Some(Event::Exit(self.openers.pop().unwrap())) + } else if matches!(dir, Dir::Open | Dir::Both) { + self.openers.push(cont); + Some(Event::Enter(cont)) + } else { + None + } + }) } } @@ -93,142 +237,42 @@ impl<'s> Iterator for Parser<'s> { type Item = Event; fn next(&mut self) -> Option { - if self.events.is_empty() { - if let Some(lexer) = &mut self.lexer { - Parse::new(lexer, &mut self.openers, &mut self.events).parse(); - } - } - - self.events.pop() - } -} - -struct Parse<'l, 's, 'e> { - tokens: &'l mut std::iter::Peekable>, - openers: &'e mut Vec, - events: &'e mut Vec, -} - -impl<'l, 's, 'e> Parse<'l, 's, 'e> { - fn new( - tokens: &'l mut std::iter::Peekable>, - openers: &'e mut Vec, - events: &'e mut Vec, - ) -> Self { - Self { - tokens, - openers, - events, - } - } - - fn peek(&mut self) -> Option<&lex::Kind> { - self.tokens.peek().map(|t| &t.kind) - } - - fn parse(&mut self) { - let mut t = if let Some(t) = self.tokens.next() { - t - } else { - return; - }; - - { - let verbatim_opt = match t.kind { - lex::Kind::Seq(lex::Sequence::Dollar) => { - let math_opt = (t.len <= 2) - .then(|| { - if let Some(lex::Kind::Seq(lex::Sequence::Backtick)) = self.peek() { - Some((DisplayMath, t.len)) - } else { - None - } - }) - .flatten(); - if math_opt.is_some() { - self.tokens.next(); // backticks + self.next.take().or_else(|| { + self.close_containers + .and_then(|i| { + if i < self.openers.len() { + Some(Event::Exit(self.openers.pop().unwrap())) + } else { + self.close_containers = None; + None } - math_opt - } - lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, t.len)), - _ => None, - }; + }) + .or_else(|| { + let mut current = self.parse_event(); - if let Some((atom, opener_len)) = verbatim_opt { - for tok in &mut self.tokens { - if matches!(tok.kind, lex::Kind::Seq(lex::Sequence::Backtick)) - && tok.len == opener_len - { - self.events.push(Event::Atom(atom)); - return; - } - } - } - } - - { - let container_opt = match t.kind { - lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)), - lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)), - lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)), - lex::Kind::Sym(Symbol::Tilde) => Some((Subscript, Dir::Both)), - lex::Kind::Sym(Symbol::Quote1) => Some((SingleQuoted, Dir::Both)), - lex::Kind::Sym(Symbol::Quote2) => Some((DoubleQuoted, Dir::Both)), - lex::Kind::Open(Delimiter::Bracket) => Some((LinkText, Dir::Open)), - lex::Kind::Open(Delimiter::BraceAsterisk) => Some((Strong, Dir::Open)), - lex::Kind::Open(Delimiter::BraceCaret) => Some((Superscript, Dir::Open)), - lex::Kind::Open(Delimiter::BraceEqual) => Some((Mark, Dir::Open)), - lex::Kind::Open(Delimiter::BraceHyphen) => Some((Delete, Dir::Open)), - lex::Kind::Open(Delimiter::BracePlus) => Some((Insert, Dir::Open)), - lex::Kind::Open(Delimiter::BraceTilde) => Some((Subscript, Dir::Open)), - lex::Kind::Open(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Open)), - lex::Kind::Close(Delimiter::Bracket) => Some((LinkText, Dir::Close)), - lex::Kind::Close(Delimiter::BraceAsterisk) => Some((Strong, Dir::Close)), - lex::Kind::Close(Delimiter::BraceCaret) => Some((Superscript, Dir::Close)), - lex::Kind::Close(Delimiter::BraceEqual) => Some((Mark, Dir::Close)), - lex::Kind::Close(Delimiter::BraceHyphen) => Some((Delete, Dir::Close)), - lex::Kind::Close(Delimiter::BracePlus) => Some((Insert, Dir::Close)), - lex::Kind::Close(Delimiter::BraceTilde) => Some((Subscript, Dir::Close)), - lex::Kind::Close(Delimiter::BraceUnderscore) => Some((Emphasis, Dir::Close)), - _ => None, - }; - - if let Some((cont, dir)) = container_opt { - if matches!(dir, Dir::Close | Dir::Both) && self.openers.contains(&cont) { - loop { - let c = self.openers.pop().unwrap(); - self.events.push(Event::Exit(c)); - if c == cont { - break; + if let Some(Event::Node(Node { kind: Str, span })) = &mut current { + self.next = self.parse_event(); + while let Some(Event::Node(Node { kind: Str, span: s })) = self.next { + *span = span.union(s); + self.next = self.parse_event(); } } - return; - } else if matches!(dir, Dir::Open | Dir::Both) { - self.openers.push(cont); - self.events.push(Event::Enter(cont)); - return; - } - } - } - { - if let lex::Kind::Open(Delimiter::Brace) = t.kind { - todo!(); // check for attr - } - } - - if let Some(Event::Atom(Str)) = self.events.last() { - } else { - self.events.push(Event::Atom(Str)); - } + current + }) + .or_else(|| self.openers.pop().map(Event::Exit)) + }) } } #[cfg(test)] mod test { + use crate::Span; + use super::Atom::*; use super::Container::*; use super::Event::*; + use super::NodeKind::*; macro_rules! test_parse { ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { @@ -241,14 +285,99 @@ mod test { }; } - #[test] - fn str() { - test_parse!("abc", Atom(Str)); - test_parse!("abc def", Atom(Str)); + impl super::NodeKind { + pub fn span(self, start: usize, end: usize) -> super::Node { + super::Node { + span: Span::new(start, end), + kind: self, + } + } } #[test] - fn container_brace() { - test_parse!("{_abc_}", Enter(Emphasis), Atom(Str), Exit(Emphasis)); + fn str() { + test_parse!("abc", Node(Str.span(0, 3))); + test_parse!("abc def", Node(Str.span(0, 7))); + } + + #[test] + fn verbatim() { + test_parse!("`abc`", Node(Verbatim.span(1, 4))); + test_parse!("`abc", Node(Verbatim.span(1, 4))); + test_parse!("``abc``", Node(Verbatim.span(2, 5))); + test_parse!("abc `def`", Node(Str.span(0, 4)), Node(Verbatim.span(5, 8))); + } + + #[test] + fn container_basic() { + test_parse!( + "_abc_", + Enter(Emphasis), + Node(Str.span(1, 4)), + Exit(Emphasis) + ); + test_parse!( + "{_abc_}", + Enter(Emphasis), + Node(Str.span(2, 5)), + Exit(Emphasis) + ); + } + + #[test] + fn container_nest() { + test_parse!( + "{_{_abc_}_}", + Enter(Emphasis), + Enter(Emphasis), + Node(Str.span(4, 7)), + Exit(Emphasis), + Exit(Emphasis) + ); + test_parse!( + "*_abc_*", + Enter(Strong), + Enter(Emphasis), + Node(Str.span(2, 5)), + Exit(Emphasis), + Exit(Strong) + ); + } + + #[test] + fn container_unopened() { + test_parse!("*}abc", Node(Str.span(0, 5)),); + } + + #[test] + fn container_close_parent() { + test_parse!( + "{*{_abc*}", + Enter(Strong), + Enter(Emphasis), + Node(Str.span(4, 7)), + Exit(Emphasis), + Exit(Strong) + ); + } + + #[test] + fn container_close_block() { + test_parse!( + "{_abc", + Enter(Emphasis), + Node(Str.span(2, 5)), + Exit(Emphasis) + ); + test_parse!( + "{_{*{_abc", + Enter(Emphasis), + Enter(Strong), + Enter(Emphasis), + Node(Str.span(6, 9)), + Exit(Emphasis), + Exit(Strong), + Exit(Emphasis), + ); } } diff --git a/src/lex.rs b/src/lex.rs index d690bf9..d6fcb17 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -229,9 +229,7 @@ impl<'s> Iterator for Lexer<'s> { type Item = Token; fn next(&mut self) -> Option { - if let Some(token) = self.next.take() { - Some(token) - } else { + self.next.take().or_else(|| { let mut current = self.token(); // concatenate text tokens @@ -244,7 +242,7 @@ impl<'s> Iterator for Lexer<'s> { } current - } + }) } } diff --git a/src/lib.rs b/src/lib.rs index a7c6393..9936d27 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,7 @@ impl<'s> Parser<'s> { src: self.src, tree: self.tree.iter(), parser: None, + inline_start: 0, } } } @@ -46,6 +47,7 @@ pub struct Iter<'s> { src: &'s str, tree: block::TreeIter<'s>, parser: Option>, + inline_start: usize, } impl<'s> Iterator for Iter<'s> { @@ -54,13 +56,17 @@ impl<'s> Iterator for Iter<'s> { fn next(&mut self) -> Option { while let Some(parser) = &mut self.parser { // inside leaf block, with inline content - if let Some(inline) = parser.next() { + if let Some(mut inline) = parser.next() { + if let inline::Event::Node(inline::Node { span, .. }) = &mut inline { + *span = span.translate(self.inline_start); + } return Some(Event::Inline(inline)); } else if let Some(ev) = self.tree.next() { match ev { tree::Event::Element(atom, sp) => { assert_eq!(*atom, block::Atom::Inline); parser.parse(sp.of(self.src)); + self.inline_start = sp.start(); } tree::Event::Exit => { self.parser = None; @@ -76,11 +82,10 @@ impl<'s> Iterator for Iter<'s> { assert_eq!(*atom, block::Atom::Blankline); Event::Blankline } - tree::Event::Enter(block @ block::Block::Container(..), ..) => { - Event::Start(block.clone()) - } - tree::Event::Enter(block @ block::Block::Leaf(..), ..) => { - self.parser = Some(inline::Parser::new()); + tree::Event::Enter(block, ..) => { + if matches!(block, block::Block::Leaf(..)) { + self.parser = Some(inline::Parser::new()); + } Event::Start(block.clone()) } tree::Event::Exit => Event::End, @@ -96,6 +101,7 @@ mod test { use crate::block::Leaf::*; use crate::inline::Atom::*; use crate::inline::Event::*; + use crate::inline::NodeKind::*; macro_rules! test_parse { ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { @@ -108,15 +114,26 @@ mod test { #[test] fn para() { - test_parse!("abc", Start(Leaf(Paragraph)), Inline(Atom(Str)), End); - test_parse!("abc def", Start(Leaf(Paragraph)), Inline(Atom(Str)), End); test_parse!( - "this is a paragraph\n\nfollowed by another one", + "para", Start(Leaf(Paragraph)), - Inline(Atom(Str)), + Inline(Node(Str.span(0, 4))), + End + ); + test_parse!( + "pa ra", + Start(Leaf(Paragraph)), + Inline(Node(Str.span(0, 9))), + End + ); + test_parse!( + "para0\n\npara1", + Start(Leaf(Paragraph)), + Inline(Node(Str.span(0, 6))), End, + Blankline, Start(Leaf(Paragraph)), - Inline(Atom(Str)), + Inline(Node(Str.span(7, 12))), End, ); } diff --git a/src/span.rs b/src/span.rs index 80ee4ee..35deb5a 100644 --- a/src/span.rs +++ b/src/span.rs @@ -16,6 +16,10 @@ impl Span { } } + pub fn empty_at(start: usize) -> Self { + Self::by_len(start, 0) + } + pub fn with_start(self, start: usize) -> Self { Self::new(start, self.end()) } @@ -31,6 +35,14 @@ impl Span { ) } + pub fn extend(self, n: usize) -> Self { + Self::new(self.start(), self.end() + n) + } + + pub fn union(self, span: Self) -> Self { + Self::new(self.start(), span.end()) + } + pub fn start(self) -> usize { self.start.try_into().unwrap() }