diff --git a/src/block.rs b/src/block.rs index 696e7a0..1602b58 100644 --- a/src/block.rs +++ b/src/block.rs @@ -7,6 +7,7 @@ use Container::*; use Leaf::*; pub type Tree = tree::Tree; +pub type TreeIter<'t> = tree::Iter<'t, Block, Atom>; pub fn parse(src: &str) -> Tree { Parser::new(src).parse() diff --git a/src/inline.rs b/src/inline.rs index fe85421..29d129f 100644 --- a/src/inline.rs +++ b/src/inline.rs @@ -1,40 +1,43 @@ use crate::Span; use crate::tree; +use crate::CowStr; use Atom::*; use Container::*; -pub type Tree = tree::Tree; +pub type Tree<'s> = tree::Tree>; -pub fn parse>(src: &str, inlines: I) -> Tree { +/* +pub fn parse<'s, I: Iterator>(src: &'s str, inlines: I) -> Vec> { Parser::new(src).parse(inlines) } +*/ -pub enum Inline { - Atom(Atom), +pub enum Inline<'s> { + Atom(Atom<'s>), Container(Container), } #[derive(Debug, Clone, PartialEq, Eq)] -pub enum Atom { +pub enum Atom<'s> { Str, Softbreak, Hardbreak, Escape, - Nbsp, - FootnoteReference, - ExplicitLink, - ReferenceLink, - Emoji, - OpenMarker, - Ellipses, - ImageMarker, - EmDash, - RawFormat, + Nbsp, // ?? + OpenMarker, // ?? + Ellipses, // ?? + ImageMarker, // ?? + EmDash, // ?? + FootnoteReference { label: CowStr<'s> }, + ExplicitLink { label: CowStr<'s> }, + ReferenceLink { label: CowStr<'s> }, + Emoji { name: CowStr<'s> }, + RawFormat { format: CowStr<'s> }, } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum Container { // attributes Attributes, @@ -63,25 +66,148 @@ pub enum Container { Destination, } -pub struct Event; +#[derive(Debug)] +pub enum Event<'s> { + Start(Container, OpenerState), + End(Container), + Atom(Atom<'s>), +} -pub struct Parser<'s> { - src: &'s str, +#[derive(Debug)] +pub enum OpenerState { + Unclosed, + Closed, + Discarded, +} + +#[derive(Debug)] +pub enum ContainerType { + Opener, + Closer, + Both, +} + +pub struct Parser<'s, I: Iterator> { + chars: std::iter::Peekable, openers: Vec<(Container, usize)>, - events: Vec<(Event, Span)>, + events: Vec>, //tree: tree::Builder, } -impl<'s> Parser<'s> { - fn new(src: &'s str) -> Self { +impl<'s, I: Iterator> Parser<'s, I> { + pub fn new(chars: I) -> Self { Self { - src, + chars: chars.peekable(), openers: Vec::new(), events: Vec::new(), } } - fn parse>(mut self, inlines: I) -> Tree { + /* + fn step(&mut self) -> lex::Token { + let token = lex::Lexer::new(&self.src[self.pos..]).next_token(); + self.pos += token.len; + std::mem::replace(&mut self.next_token, token) + } + + fn eat(&mut self) -> lex::TokenKind { + loop { + let end = self.pos; + let token = self.step(); + if !matches!(token.kind, lex::TokenKind::Whitespace) { + self.span = Span::new(end - token.len, end); + return token.kind; + } + } + } + + fn peek(&mut self) -> &lex::TokenKind { + if matches!(self.next_token.kind, lex::TokenKind::Whitespace) { + let _whitespace = self.step(); + } + &self.next_token.kind + } + */ + + pub fn parse(mut self) -> Vec<(Event<'s>, u32)> { + let mut len = 0; + + while let Some(c) = self.chars.peek() { + //let start = self.pos(); + + let cont = match c { + '*' => Some((Strong, ContainerType::Both)), + '_' => Some((Emph, ContainerType::Both)), + '^' => Some((Superscript, ContainerType::Both)), + '~' => Some((Subscript, ContainerType::Both)), + '\'' => Some((SingleQuoted, ContainerType::Both)), + '"' => Some((DoubleQuoted, ContainerType::Both)), + '`' => todo!(), + '{' => todo!(), + '$' => todo!(), + '<' => todo!(), + '[' => todo!(), + _ => None, + }; + + let ev = cont + .and_then(|(cont, ty)| { + self.openers + .iter() + .rposition(|(c, _)| *c == cont) + .map(|i| { + if let Event::Start(c, state) = &mut self.events[i] { + assert_eq!(*c, cont); + if matches!(ty, ContainerType::Closer | ContainerType::Both) { + *state = OpenerState::Closed; + Some(Event::End(cont)) + } else if matches!(ty, ContainerType::Opener | ContainerType::Both) + { + *state = OpenerState::Discarded; + Some(Event::Start(cont, OpenerState::Unclosed)) + } else { + None + } + } else { + unreachable!() + } + }) + .unwrap_or_else(|| { + matches!(ty, ContainerType::Opener | ContainerType::Both).then(|| { + self.openers.push((cont, self.events.len())); + Event::Start(cont, OpenerState::Unclosed) + }) + }) + }) + .unwrap_or(Event::Atom(Str)); + + self.events.push(ev); + } + //self.events todo!() } } + +/* +impl<'s> Iterator for Parser<'s> { + type Item = (Event<'s>, Span); + + fn next(&mut self) -> Option { + self.chars.next().map(|c| { + match c { + '*' => todo!(), + '_' => todo!(), + '^' => todo!(), + '~' => todo!(), + '\'' => todo!(), + '"' => todo!(), + '$' => todo!(), + '<' => todo!(), + '{' => todo!(), + '[' => todo!(), + _ => + } + }) + } +} +*/ diff --git a/src/lex.rs b/src/lex.rs new file mode 100644 index 0000000..fab6c60 --- /dev/null +++ b/src/lex.rs @@ -0,0 +1,358 @@ +use crate::EOF; + +use Delimiter::*; +use Sequence::*; +use Symbol::*; +use TokenKind::*; + +#[derive(Debug)] +pub(crate) struct Token { + pub kind: TokenKind, + pub len: usize, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum TokenKind { + Text, + Whitespace, + Nbsp, + Escape, + Integer, + Open(Delimiter), + Close(Delimiter), + Sym(Symbol), + Seq(Sequence), +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Delimiter { + Brace, + BraceAsterisk, + BraceCaret, + BraceEqual, + BraceHyphen, + BracePlus, + BraceTilde, + BraceUnderscore, + Bracket, + Paren, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Symbol { + Asterisk, + Caret, + Dollar1, + Dollar2, + Equal, + Exclaim, + Gt, + Lt, + Percentage, + Pipe, + Plus, + Quote1, + Quote2, + Tilde, + Underscore, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Sequence { + Backtick, + Colon, + Hash, + Hyphen, + Period, +} + +impl Sequence { + fn ch(self) -> char { + match self { + Self::Backtick => '`', + Self::Colon => ':', + Self::Hash => '#', + Self::Period => '.', + Self::Hyphen => '-', + } + } +} + +pub(crate) struct Lexer> { + chars: std::iter::Peekable, + escape: bool, + next: Option, + len: usize, +} + +impl> Lexer { + pub fn new(chars: I) -> Lexer { + Lexer { + chars: chars.peekable(), + escape: false, + next: None, + len: 0, + } + } + + fn peek(&mut self) -> char { + self.chars.peek().copied().unwrap_or(EOF) + } + + fn eat(&mut self) -> Option { + let c = self.chars.next(); + self.len += c.map_or(0, char::len_utf8); + c + } + + fn len(&self) -> usize { + self.len + } + + fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + while predicate(self.peek()) { + self.eat(); + } + } + + fn token(&mut self) -> Option { + let first = self.eat()?; + + let escape = self.escape; + + let kind = match first { + _ if escape && first == ' ' => Nbsp, + _ if escape => Text, + + '\\' => { + let next = self.peek(); + if next == ' ' || next.is_ascii_punctuation() { + self.escape = true; + Escape + } else { + Text + } + } + + _ if first.is_whitespace() => { + self.eat_while(char::is_whitespace); + Whitespace + } + + '(' => Open(Paren), + ')' => Close(Paren), + '[' => Open(Bracket), + ']' => Close(Bracket), + '{' => { + let explicit = match self.peek() { + '*' => Some(Open(BraceAsterisk)), + '^' => Some(Open(BraceCaret)), + '=' => Some(Open(BraceEqual)), + '-' => Some(Open(BraceHyphen)), + '+' => Some(Open(BracePlus)), + '~' => Some(Open(BraceTilde)), + '_' => Some(Open(BraceUnderscore)), + _ => None, + }; + if let Some(exp) = explicit { + self.eat(); + exp + } else { + Open(Brace) + } + } + '*' => self.maybe_eat_close_brace(Asterisk, BraceAsterisk), + '^' => self.maybe_eat_close_brace(Caret, BraceCaret), + '=' => self.maybe_eat_close_brace(Equal, BraceEqual), + '+' => self.maybe_eat_close_brace(Plus, BracePlus), + '~' => self.maybe_eat_close_brace(Tilde, BraceTilde), + '_' => self.maybe_eat_close_brace(Underscore, BraceUnderscore), + '-' => { + if self.peek() == '}' { + self.eat(); + Close(BraceHyphen) + } else { + self.eat_seq(Hyphen) + } + } + + '$' => { + if self.peek() == '$' { + self.eat(); + Sym(Dollar2) + } else { + Sym(Dollar1) + } + } + '!' => Sym(Exclaim), + '%' => Sym(Percentage), + '<' => Sym(Lt), + '>' => Sym(Gt), + '|' => Sym(Pipe), + '\'' => Sym(Quote1), + '"' => Sym(Quote2), + + '`' => self.eat_seq(Backtick), + ':' => self.eat_seq(Colon), + '#' => self.eat_seq(Hash), + '.' => self.eat_seq(Period), + + '0'..='9' => { + self.eat_while(|c| c.is_ascii_digit()); + Integer + } + + _ => Text, + }; + + if escape { + self.escape = false; + } + + let len = self.len(); + + Some(Token { kind, len }) + } + + fn eat_seq(&mut self, s: Sequence) -> TokenKind { + self.eat_while(|c| c == s.ch()); + Seq(s) + } + + fn maybe_eat_close_brace(&mut self, s: Symbol, d: Delimiter) -> TokenKind { + if self.peek() == '}' { + self.eat(); + Close(d) + } else { + Sym(s) + } + } +} + +impl> Iterator for Lexer { + type Item = Token; + + fn next(&mut self) -> Option { + if let Some(token) = self.next.take() { + Some(token) + } else { + let mut current = self.token(); + + // concatenate text tokens + if let Some(Token { kind: Text, len }) = &mut current { + self.next = self.token(); + while let Some(Token { kind: Text, len: l }) = self.next { + *len += l; + self.next = self.token(); + } + } + + current + } + } +} + +#[cfg(test)] +mod test { + use super::Delimiter::*; + use super::Sequence::*; + use super::Symbol::*; + use super::TokenKind::*; + + macro_rules! test_lex { + ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { + #[allow(unused)] + let actual = super::Lexer::new($src.chars()).map(|t| t.kind).collect::>(); + let expected = vec![$($($token),*,)?]; + assert_eq!(actual, expected, "{}", $src); + }; + } + + #[test] + fn empty() { + test_lex!(""); + } + + #[test] + fn basic() { + test_lex!("abc", Text); + test_lex!( + "para w/ some _emphasis_ and *strong*.", + Text, + Whitespace, + Text, + Whitespace, + Text, + Whitespace, + Sym(Underscore), + Text, + Sym(Underscore), + Whitespace, + Text, + Whitespace, + Sym(Asterisk), + Text, + Sym(Asterisk), + Seq(Period) + ); + } + + #[test] + fn escape() { + test_lex!(r#"\a"#, Text); + test_lex!(r#"\\a"#, Escape, Text); + test_lex!(r#"\."#, Escape, Text); + test_lex!(r#"\ "#, Escape, Nbsp); + test_lex!(r#"\{-"#, Escape, Text, Seq(Hyphen)); + } + + #[test] + fn delim() { + test_lex!("{-", Open(BraceHyphen)); + test_lex!("-}", Close(BraceHyphen)); + test_lex!("{++}", Open(BracePlus), Close(BracePlus)); + } + + #[test] + fn sym() { + test_lex!( + r#"'*^=!><%|+"~_"#, + Sym(Quote1), + Sym(Asterisk), + Sym(Caret), + Sym(Equal), + Sym(Exclaim), + Sym(Gt), + Sym(Lt), + Sym(Percentage), + Sym(Pipe), + Sym(Plus), + Sym(Quote2), + Sym(Tilde), + Sym(Underscore), + ); + test_lex!("''''", Sym(Quote1), Sym(Quote1), Sym(Quote1), Sym(Quote1),); + } + + #[test] + fn seq() { + test_lex!("`", Seq(Backtick)); + test_lex!("```", Seq(Backtick)); + test_lex!( + "`:#-.", + Seq(Backtick), + Seq(Colon), + Seq(Hash), + Seq(Hyphen), + Seq(Period), + ); + } + + #[test] + fn int() { + test_lex!("1", Integer); + test_lex!("123", Integer); + test_lex!("1234567890", Integer); + test_lex!("000", Integer); + } +} diff --git a/src/lib.rs b/src/lib.rs index 20f4aec..362062c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,110 @@ mod block; mod inline; +mod lex; mod span; mod tree; -pub use block::parse; -pub use block::Tree; +use inline::Atom; +use inline::Container as InlineTag; + +pub struct Block; const EOF: char = '\0'; +type CowStr<'s> = std::borrow::Cow<'s, str>; + +/* +pub enum Tag<'s> { + Paragraph, + Heading { level: u8 }, + BlockQuote, + CodeBlock { info_string: CowStr<'s> }, + List { start_index: Option }, + ListItem, + FootnoteDefinition { label: CowStr<'s> }, + Table, + Image {}, + Link {}, + Block(Block), + Inline(InlineTag), +} + +pub struct Attributes; // TODO + +pub enum Event<'s> { + Start(Tag<'s>, Attributes), + End(Tag<'s>), + Atom(Atom<'s>), +} +*/ + use span::Span; + +pub struct Parser<'s> { + src: &'s str, + tree: block::Tree, +} + +impl<'s> Parser<'s> { + pub fn new(src: &'s str) -> Self { + Self { + src, + tree: block::parse(src), + } + } + + pub fn parse(&mut self) {} + + pub fn iter(&self) -> Iter { + Iter { + src: self.src, + tree: self.tree.iter().peekable(), + } + } +} + +pub struct Iter<'s> { + src: &'s str, + tree: std::iter::Peekable>, +} + +impl<'s> Iterator for Iter<'s> { + type Item = String; + + fn next(&mut self) -> Option { + self.tree.next().map(|ev| match ev { + tree::Event::Enter(block::Block::Container(cont), _sp) => { + format!("cont {:?}", cont) + } + tree::Event::Enter(block::Block::Leaf(leaf), _sp) => { + // concatenate all inlines + let chars = (&mut self.tree) + .take_while(|ev| matches!(ev, tree::Event::Element(..))) + .flat_map(|ev| ev.span().of(self.src).chars()); + let evs = inline::Parser::new(chars).parse(); + /* + let chars = std::iter::from_fn(|| { + let mut eat = false; + let ret = if let Some(tree::Event::Element(_a, sp)) = self.tree.peek() { + eat = true; + let chars = sp.of(self.src).chars(); + Some(chars) + } else { + None + }; + if eat { + self.tree.next(); + } + ret + }) + .flatten(); + */ + format!("leaf {:?} {:?}", leaf, evs) + } + tree::Event::Element(atom, _sp) => { + format!("atom {:?}", atom) + } + tree::Event::Exit => "exit".to_string(), + }) + } +} diff --git a/src/main.rs b/src/main.rs index 4131af4..c6c2334 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,5 +6,7 @@ fn main() { .read_to_string(&mut src) .expect("failed to read unicode file"); - print!("{}", jotdown::parse(&src)); + let p = jotdown::Parser::new(&src); + let v = p.iter().collect::>(); + print!("{:?}", v); } diff --git a/src/tree.rs b/src/tree.rs index 7c06efb..088fb5c 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -22,6 +22,15 @@ pub enum Event<'a, C, E> { Exit, } +impl<'a, C, E> Event<'a, C, E> { + pub fn span(&self) -> Span { + match self { + Self::Enter(_, sp) | Self::Element(_, sp) => *sp, + Self::Exit => panic!(), + } + } +} + pub struct Iter<'a, C, E> { nodes: &'a [Node], branch: Vec,