Noah Hellman 2022-11-20 19:13:48 +01:00
parent 5aa6d337ff
commit fe45519ca9
6 changed files with 622 additions and 27 deletions


@@ -7,6 +7,7 @@ use Container::*;
 use Leaf::*;
 
 pub type Tree = tree::Tree<Block, Atom>;
+pub type TreeIter<'t> = tree::Iter<'t, Block, Atom>;
 
 pub fn parse(src: &str) -> Tree {
     Parser::new(src).parse()


@@ -1,40 +1,43 @@
 use crate::Span;
 use crate::tree;
+use crate::CowStr;
 
 use Atom::*;
 use Container::*;
 
-pub type Tree = tree::Tree<Container, Atom>;
+pub type Tree<'s> = tree::Tree<Container, Atom<'s>>;
 
-pub fn parse<I: Iterator<Item = Span>>(src: &str, inlines: I) -> Tree {
+/*
+pub fn parse<'s, I: Iterator<Item = Span>>(src: &'s str, inlines: I) -> Vec<Event<'s>> {
     Parser::new(src).parse(inlines)
 }
+*/
 
-pub enum Inline {
-    Atom(Atom),
+pub enum Inline<'s> {
+    Atom(Atom<'s>),
     Container(Container),
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub enum Atom {
+pub enum Atom<'s> {
     Str,
     Softbreak,
     Hardbreak,
     Escape,
-    Nbsp,
-    FootnoteReference,
-    ExplicitLink,
-    ReferenceLink,
-    Emoji,
-    OpenMarker,
-    Ellipses,
-    ImageMarker,
-    EmDash,
-    RawFormat,
+    Nbsp, // ??
+    OpenMarker, // ??
+    Ellipses, // ??
+    ImageMarker, // ??
+    EmDash, // ??
+    FootnoteReference { label: CowStr<'s> },
+    ExplicitLink { label: CowStr<'s> },
+    ReferenceLink { label: CowStr<'s> },
+    Emoji { name: CowStr<'s> },
+    RawFormat { format: CowStr<'s> },
 }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub enum Container {
     // attributes
     Attributes,
@@ -63,25 +66,148 @@ pub enum Container {
     Destination,
 }
 
-pub struct Event;
+#[derive(Debug)]
+pub enum Event<'s> {
+    Start(Container, OpenerState),
+    End(Container),
+    Atom(Atom<'s>),
+}
 
-pub struct Parser<'s> {
-    src: &'s str,
+#[derive(Debug)]
+pub enum OpenerState {
+    Unclosed,
+    Closed,
+    Discarded,
+}
+
+#[derive(Debug)]
+pub enum ContainerType {
+    Opener,
+    Closer,
+    Both,
+}
+
+pub struct Parser<'s, I: Iterator<Item = char>> {
+    chars: std::iter::Peekable<I>,
     openers: Vec<(Container, usize)>,
-    events: Vec<(Event, Span)>,
+    events: Vec<Event<'s>>,
     //tree: tree::Builder<Container, Atom>,
 }
 
-impl<'s> Parser<'s> {
-    fn new(src: &'s str) -> Self {
+impl<'s, I: Iterator<Item = char>> Parser<'s, I> {
+    pub fn new(chars: I) -> Self {
         Self {
-            src,
+            chars: chars.peekable(),
             openers: Vec::new(),
             events: Vec::new(),
         }
     }
 
-    fn parse<I: Iterator<Item = Span>>(mut self, inlines: I) -> Tree {
+    /*
+    fn step(&mut self) -> lex::Token {
+        let token = lex::Lexer::new(&self.src[self.pos..]).next_token();
+        self.pos += token.len;
+        std::mem::replace(&mut self.next_token, token)
+    }
+
+    fn eat(&mut self) -> lex::TokenKind {
+        loop {
+            let end = self.pos;
+            let token = self.step();
+            if !matches!(token.kind, lex::TokenKind::Whitespace) {
+                self.span = Span::new(end - token.len, end);
+                return token.kind;
+            }
+        }
+    }
+
+    fn peek(&mut self) -> &lex::TokenKind {
+        if matches!(self.next_token.kind, lex::TokenKind::Whitespace) {
+            let _whitespace = self.step();
+        }
+        &self.next_token.kind
+    }
+    */
+
+    pub fn parse(mut self) -> Vec<(Event<'s>, u32)> {
+        let mut len = 0;
+        while let Some(c) = self.chars.peek() {
+            //let start = self.pos();
+            let cont = match c {
+                '*' => Some((Strong, ContainerType::Both)),
+                '_' => Some((Emph, ContainerType::Both)),
+                '^' => Some((Superscript, ContainerType::Both)),
+                '~' => Some((Subscript, ContainerType::Both)),
+                '\'' => Some((SingleQuoted, ContainerType::Both)),
+                '"' => Some((DoubleQuoted, ContainerType::Both)),
+                '`' => todo!(),
+                '{' => todo!(),
+                '$' => todo!(),
+                '<' => todo!(),
+                '[' => todo!(),
+                _ => None,
+            };
+            let ev = cont
+                .and_then(|(cont, ty)| {
+                    self.openers
+                        .iter()
+                        .rposition(|(c, _)| *c == cont)
+                        .map(|i| {
+                            if let Event::Start(c, state) = &mut self.events[i] {
+                                assert_eq!(*c, cont);
+                                if matches!(ty, ContainerType::Closer | ContainerType::Both) {
+                                    *state = OpenerState::Closed;
+                                    Some(Event::End(cont))
+                                } else if matches!(ty, ContainerType::Opener | ContainerType::Both)
+                                {
+                                    *state = OpenerState::Discarded;
+                                    Some(Event::Start(cont, OpenerState::Unclosed))
+                                } else {
+                                    None
+                                }
+                            } else {
+                                unreachable!()
+                            }
+                        })
+                        .unwrap_or_else(|| {
+                            matches!(ty, ContainerType::Opener | ContainerType::Both).then(|| {
+                                self.openers.push((cont, self.events.len()));
+                                Event::Start(cont, OpenerState::Unclosed)
+                            })
+                        })
+                })
+                .unwrap_or(Event::Atom(Str));
+            self.events.push(ev);
+        }
+        //self.events
         todo!()
     }
 }
+
+/*
+impl<'s> Iterator for Parser<'s> {
+    type Item = (Event<'s>, Span);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.chars.next().map(|c| {
+            match c {
+                '*' => todo!(),
+                '_' => todo!(),
+                '^' => todo!(),
+                '~' => todo!(),
+                '\'' => todo!(),
+                '"' => todo!(),
+                '$' => todo!(),
+                '<' => todo!(),
+                '{' => todo!(),
+                '[' => todo!(),
+                _ =>
+            }
+        })
+    }
+}
+*/
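
Aside (not part of the commit): `Parser::parse` above keeps a stack of currently unmatched openers and, for each delimiter character, searches it back-to-front with `rposition` to decide whether the delimiter closes an earlier container or opens a new one. A reduced, self-contained sketch of that idea, using plain chars in place of the crate's `Container`/`Event`/`OpenerState` types (all names below are illustrative):

// Illustration only, not crate code. '*' and '_' stand in for Strong/Emph;
// an opener that never finds a closer would be demoted back to plain text in
// a later pass (the crate tracks this with OpenerState::Unclosed).
#[derive(Debug)]
enum Ev {
    Open(char),
    Close(char),
    Str(char),
}

fn match_delims(input: &str) -> Vec<Ev> {
    let mut events = Vec::new();
    // The crate also records the index of each Start event so it can later be
    // marked Closed or Discarded; a bare stack of delimiters is enough here.
    let mut openers: Vec<char> = Vec::new();
    for c in input.chars() {
        if matches!(c, '*' | '_') {
            if let Some(i) = openers.iter().rposition(|d| *d == c) {
                // Most recent matching opener: this occurrence closes it, and
                // any openers pushed after it can no longer be closed.
                openers.truncate(i);
                events.push(Ev::Close(c));
            } else {
                openers.push(c);
                events.push(Ev::Open(c));
            }
        } else {
            events.push(Ev::Str(c));
        }
    }
    events
}

fn main() {
    // The '_' opener never finds a closer, so it stays unclosed.
    println!("{:?}", match_delims("*a_b*"));
}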

src/lex.rs (new file)

@@ -0,0 +1,358 @@
use crate::EOF;

use Delimiter::*;
use Sequence::*;
use Symbol::*;
use TokenKind::*;

#[derive(Debug)]
pub(crate) struct Token {
    pub kind: TokenKind,
    pub len: usize,
}

#[derive(Debug, PartialEq, Eq)]
pub enum TokenKind {
    Text,
    Whitespace,
    Nbsp,
    Escape,
    Integer,
    Open(Delimiter),
    Close(Delimiter),
    Sym(Symbol),
    Seq(Sequence),
}

#[derive(Debug, PartialEq, Eq)]
pub enum Delimiter {
    Brace,
    BraceAsterisk,
    BraceCaret,
    BraceEqual,
    BraceHyphen,
    BracePlus,
    BraceTilde,
    BraceUnderscore,
    Bracket,
    Paren,
}

#[derive(Debug, PartialEq, Eq)]
pub enum Symbol {
    Asterisk,
    Caret,
    Dollar1,
    Dollar2,
    Equal,
    Exclaim,
    Gt,
    Lt,
    Percentage,
    Pipe,
    Plus,
    Quote1,
    Quote2,
    Tilde,
    Underscore,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Sequence {
    Backtick,
    Colon,
    Hash,
    Hyphen,
    Period,
}

impl Sequence {
    fn ch(self) -> char {
        match self {
            Self::Backtick => '`',
            Self::Colon => ':',
            Self::Hash => '#',
            Self::Period => '.',
            Self::Hyphen => '-',
        }
    }
}

pub(crate) struct Lexer<I: Iterator<Item = char>> {
    chars: std::iter::Peekable<I>,
    escape: bool,
    next: Option<Token>,
    len: usize,
}

impl<I: Iterator<Item = char>> Lexer<I> {
    pub fn new(chars: I) -> Lexer<I> {
        Lexer {
            chars: chars.peekable(),
            escape: false,
            next: None,
            len: 0,
        }
    }

    fn peek(&mut self) -> char {
        self.chars.peek().copied().unwrap_or(EOF)
    }

    fn eat(&mut self) -> Option<char> {
        let c = self.chars.next();
        self.len += c.map_or(0, char::len_utf8);
        c
    }

    fn len(&self) -> usize {
        self.len
    }

    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        while predicate(self.peek()) {
            self.eat();
        }
    }

    fn token(&mut self) -> Option<Token> {
        let first = self.eat()?;

        let escape = self.escape;

        let kind = match first {
            _ if escape && first == ' ' => Nbsp,
            _ if escape => Text,
            '\\' => {
                let next = self.peek();
                if next == ' ' || next.is_ascii_punctuation() {
                    self.escape = true;
                    Escape
                } else {
                    Text
                }
            }
            _ if first.is_whitespace() => {
                self.eat_while(char::is_whitespace);
                Whitespace
            }
            '(' => Open(Paren),
            ')' => Close(Paren),
            '[' => Open(Bracket),
            ']' => Close(Bracket),
            '{' => {
                let explicit = match self.peek() {
                    '*' => Some(Open(BraceAsterisk)),
                    '^' => Some(Open(BraceCaret)),
                    '=' => Some(Open(BraceEqual)),
                    '-' => Some(Open(BraceHyphen)),
                    '+' => Some(Open(BracePlus)),
                    '~' => Some(Open(BraceTilde)),
                    '_' => Some(Open(BraceUnderscore)),
                    _ => None,
                };
                if let Some(exp) = explicit {
                    self.eat();
                    exp
                } else {
                    Open(Brace)
                }
            }
            '*' => self.maybe_eat_close_brace(Asterisk, BraceAsterisk),
            '^' => self.maybe_eat_close_brace(Caret, BraceCaret),
            '=' => self.maybe_eat_close_brace(Equal, BraceEqual),
            '+' => self.maybe_eat_close_brace(Plus, BracePlus),
            '~' => self.maybe_eat_close_brace(Tilde, BraceTilde),
            '_' => self.maybe_eat_close_brace(Underscore, BraceUnderscore),
            '-' => {
                if self.peek() == '}' {
                    self.eat();
                    Close(BraceHyphen)
                } else {
                    self.eat_seq(Hyphen)
                }
            }
            '$' => {
                if self.peek() == '$' {
                    self.eat();
                    Sym(Dollar2)
                } else {
                    Sym(Dollar1)
                }
            }
            '!' => Sym(Exclaim),
            '%' => Sym(Percentage),
            '<' => Sym(Lt),
            '>' => Sym(Gt),
            '|' => Sym(Pipe),
            '\'' => Sym(Quote1),
            '"' => Sym(Quote2),
            '`' => self.eat_seq(Backtick),
            ':' => self.eat_seq(Colon),
            '#' => self.eat_seq(Hash),
            '.' => self.eat_seq(Period),
            '0'..='9' => {
                self.eat_while(|c| c.is_ascii_digit());
                Integer
            }
            _ => Text,
        };

        if escape {
            self.escape = false;
        }

        let len = self.len();
        Some(Token { kind, len })
    }

    fn eat_seq(&mut self, s: Sequence) -> TokenKind {
        self.eat_while(|c| c == s.ch());
        Seq(s)
    }

    fn maybe_eat_close_brace(&mut self, s: Symbol, d: Delimiter) -> TokenKind {
        if self.peek() == '}' {
            self.eat();
            Close(d)
        } else {
            Sym(s)
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for Lexer<I> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(token) = self.next.take() {
            Some(token)
        } else {
            let mut current = self.token();

            // concatenate text tokens
            if let Some(Token { kind: Text, len }) = &mut current {
                self.next = self.token();
                while let Some(Token { kind: Text, len: l }) = self.next {
                    *len += l;
                    self.next = self.token();
                }
            }

            current
        }
    }
}

#[cfg(test)]
mod test {
    use super::Delimiter::*;
    use super::Sequence::*;
    use super::Symbol::*;
    use super::TokenKind::*;

    macro_rules! test_lex {
        ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
            #[allow(unused)]
            let actual = super::Lexer::new($src.chars()).map(|t| t.kind).collect::<Vec<_>>();
            let expected = vec![$($($token),*,)?];
            assert_eq!(actual, expected, "{}", $src);
        };
    }

    #[test]
    fn empty() {
        test_lex!("");
    }

    #[test]
    fn basic() {
        test_lex!("abc", Text);
        test_lex!(
            "para w/ some _emphasis_ and *strong*.",
            Text,
            Whitespace,
            Text,
            Whitespace,
            Text,
            Whitespace,
            Sym(Underscore),
            Text,
            Sym(Underscore),
            Whitespace,
            Text,
            Whitespace,
            Sym(Asterisk),
            Text,
            Sym(Asterisk),
            Seq(Period)
        );
    }

    #[test]
    fn escape() {
        test_lex!(r#"\a"#, Text);
        test_lex!(r#"\\a"#, Escape, Text);
        test_lex!(r#"\."#, Escape, Text);
        test_lex!(r#"\ "#, Escape, Nbsp);
        test_lex!(r#"\{-"#, Escape, Text, Seq(Hyphen));
    }

    #[test]
    fn delim() {
        test_lex!("{-", Open(BraceHyphen));
        test_lex!("-}", Close(BraceHyphen));
        test_lex!("{++}", Open(BracePlus), Close(BracePlus));
    }

    #[test]
    fn sym() {
        test_lex!(
            r#"'*^=!><%|+"~_"#,
            Sym(Quote1),
            Sym(Asterisk),
            Sym(Caret),
            Sym(Equal),
            Sym(Exclaim),
            Sym(Gt),
            Sym(Lt),
            Sym(Percentage),
            Sym(Pipe),
            Sym(Plus),
            Sym(Quote2),
            Sym(Tilde),
            Sym(Underscore),
        );
        test_lex!("''''", Sym(Quote1), Sym(Quote1), Sym(Quote1), Sym(Quote1),);
    }

    #[test]
    fn seq() {
        test_lex!("`", Seq(Backtick));
        test_lex!("```", Seq(Backtick));
        test_lex!(
            "`:#-.",
            Seq(Backtick),
            Seq(Colon),
            Seq(Hash),
            Seq(Hyphen),
            Seq(Period),
        );
    }

    #[test]
    fn int() {
        test_lex!("1", Integer);
        test_lex!("123", Integer);
        test_lex!("1234567890", Integer);
        test_lex!("000", Integer);
    }
}
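
Aside (not part of the commit): a possible extra case in the style of the tests above, if added inside `mod test`, showing the single-character lookahead that pairs `{X` / `X}` into brace delimiters while a lone symbol stays a `Sym` (hypothetical test, not in this commit):

    #[test]
    fn brace_lookahead() {
        test_lex!("{=a=}", Open(BraceEqual), Text, Close(BraceEqual));
        test_lex!("=", Sym(Equal));
    }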


@@ -1,11 +1,110 @@
 mod block;
 mod inline;
+mod lex;
 mod span;
 mod tree;
 
-pub use block::parse;
-pub use block::Tree;
+use inline::Atom;
+use inline::Container as InlineTag;
+
+pub struct Block;
 
 const EOF: char = '\0';
 
+type CowStr<'s> = std::borrow::Cow<'s, str>;
+
+/*
+pub enum Tag<'s> {
+    Paragraph,
+    Heading { level: u8 },
+    BlockQuote,
+    CodeBlock { info_string: CowStr<'s> },
+    List { start_index: Option<u64> },
+    ListItem,
+    FootnoteDefinition { label: CowStr<'s> },
+    Table,
+    Image {},
+    Link {},
+    Block(Block),
+    Inline(InlineTag),
+}
+
+pub struct Attributes; // TODO
+
+pub enum Event<'s> {
+    Start(Tag<'s>, Attributes),
+    End(Tag<'s>),
+    Atom(Atom<'s>),
+}
+*/
+
 use span::Span;
+
+pub struct Parser<'s> {
+    src: &'s str,
+    tree: block::Tree,
+}
+
+impl<'s> Parser<'s> {
+    pub fn new(src: &'s str) -> Self {
+        Self {
+            src,
+            tree: block::parse(src),
+        }
+    }
+
+    pub fn parse(&mut self) {}
+
+    pub fn iter(&self) -> Iter {
+        Iter {
+            src: self.src,
+            tree: self.tree.iter().peekable(),
+        }
+    }
+}
+
+pub struct Iter<'s> {
+    src: &'s str,
+    tree: std::iter::Peekable<block::TreeIter<'s>>,
+}
+
+impl<'s> Iterator for Iter<'s> {
+    type Item = String;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.tree.next().map(|ev| match ev {
+            tree::Event::Enter(block::Block::Container(cont), _sp) => {
+                format!("cont {:?}", cont)
+            }
+            tree::Event::Enter(block::Block::Leaf(leaf), _sp) => {
+                // concatenate all inlines
+                let chars = (&mut self.tree)
+                    .take_while(|ev| matches!(ev, tree::Event::Element(..)))
+                    .flat_map(|ev| ev.span().of(self.src).chars());
+                let evs = inline::Parser::new(chars).parse();
+                /*
+                let chars = std::iter::from_fn(|| {
+                    let mut eat = false;
+                    let ret = if let Some(tree::Event::Element(_a, sp)) = self.tree.peek() {
+                        eat = true;
+                        let chars = sp.of(self.src).chars();
+                        Some(chars)
+                    } else {
+                        None
+                    };
+                    if eat {
+                        self.tree.next();
+                    }
+                    ret
+                })
+                .flatten();
+                */
+                format!("leaf {:?} {:?}", leaf, evs)
+            }
+            tree::Event::Element(atom, _sp) => {
+                format!("atom {:?}", atom)
+            }
+            tree::Event::Exit => "exit".to_string(),
+        })
+    }
+}
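
Aside (not part of the commit): the commented-out block in `Iter::next` above points at a real difference between the two approaches: `take_while` consumes the first non-`Element` event from the peekable tree iterator, while peeking before each `next` leaves it in place for the outer loop. A reduced illustration with plain integers (nothing here is crate code):

fn main() {
    // take_while: the first failing item (10) is consumed and lost.
    let mut a = [1, 2, 3, 10, 4].iter().copied().peekable();
    let taken: Vec<i32> = a.by_ref().take_while(|&x| x < 5).collect();
    assert_eq!(taken, vec![1, 2, 3]);
    assert_eq!(a.next(), Some(4)); // 10 is gone

    // peek, then next: the first failing item survives.
    let mut b = [1, 2, 3, 10, 4].iter().copied().peekable();
    let peeked: Vec<i32> = std::iter::from_fn(|| {
        if b.peek().map_or(false, |&x| x < 5) {
            b.next()
        } else {
            None
        }
    })
    .collect();
    assert_eq!(peeked, vec![1, 2, 3]);
    assert_eq!(b.next(), Some(10)); // 10 is still there
}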


@@ -6,5 +6,7 @@ fn main() {
         .read_to_string(&mut src)
         .expect("failed to read unicode file");
 
-    print!("{}", jotdown::parse(&src));
+    let p = jotdown::Parser::new(&src);
+    let v = p.iter().collect::<Vec<_>>();
+    print!("{:?}", v);
 }


@@ -22,6 +22,15 @@ pub enum Event<'a, C, E> {
     Exit,
 }
 
+impl<'a, C, E> Event<'a, C, E> {
+    pub fn span(&self) -> Span {
+        match self {
+            Self::Enter(_, sp) | Self::Element(_, sp) => *sp,
+            Self::Exit => panic!(),
+        }
+    }
+}
+
 pub struct Iter<'a, C, E> {
     nodes: &'a [Node<C, E>],
     branch: Vec<NodeIndex>,