From 40a612df95198f7a01992509d1f4ed06c376c281 Mon Sep 17 00:00:00 2001 From: Noah Hellman Date: Sat, 12 Nov 2022 18:45:17 +0100 Subject: [PATCH] parse block elements --- Cargo.lock | 7 + Cargo.toml | 15 ++ src/block.rs | 384 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 10 ++ src/main.rs | 10 ++ src/span.rs | 49 +++++++ src/tree.rs | 268 +++++++++++++++++++++++++++++++++++ 7 files changed, 743 insertions(+) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/block.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/span.rs create mode 100644 src/tree.rs diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..0306aa9 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "jotdown" +version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5246238 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "jotdown" +description = "A parser for the Djot markup language" +authors = ["Noah Hellman "] +version = "0.1.0" +license = "MIT" +edition = "2021" +keywords = ["djot", "markup"] +categories = ["parser-implementations"] +homepage = "https://hllmn.net/projects/jotdown" +repository = "https://github.com/hellux/jotdown" +documentation = "https://docs.rs/jotdown" +exclude = [ + "Makefile", +] diff --git a/src/block.rs b/src/block.rs new file mode 100644 index 0000000..696e7a0 --- /dev/null +++ b/src/block.rs @@ -0,0 +1,384 @@ +use crate::Span; +use crate::EOF; + +use crate::tree; + +use Container::*; +use Leaf::*; + +pub type Tree = tree::Tree; + +pub fn parse(src: &str) -> Tree { + Parser::new(src).parse() +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Block { + Leaf(Leaf), + Container(Container), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Leaf { + Paragraph, + Heading { + level: usize, + }, + Attributes, + Table, + ThematicBreak, + LinkDefinition, + CodeBlock { + fence_char: char, + fence_length: usize, + }, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Container { + Blockquote, + Div { fence_length: usize }, + ListItem { indent: usize }, + Footnote { indent: usize }, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Atom { + /// Inline content with unparsed inline elements. + Inline, + /// A line with no non-whitespace characters. + Blankline, +} + +struct Parser<'s> { + src: &'s str, + tree: tree::Builder, +} + +impl<'s> Parser<'s> { + #[must_use] + pub fn new(src: &'s str) -> Self { + Self { + src, + tree: tree::Builder::new(), + } + } + + #[must_use] + pub fn parse(mut self) -> Tree { + let mut lines = lines(self.src).collect::>(); + let mut line_pos = 0; + loop { + let line_count = self.parse_block(&mut lines[line_pos..]); + if line_count == 0 { + break; + } + line_pos += line_count; + } + self.tree.finish() + } + + /// Recursively parse a block and all of its children. Return number of lines the block uses. + fn parse_block(&mut self, lines: &mut [Span]) -> usize { + let blanklines = lines + .iter() + .take_while(|sp| sp.of(self.src).trim().is_empty()) + .map(|sp| { + self.tree.elem(Atom::Blankline, *sp); + }) + .count(); + let lines = &mut lines[blanklines..]; + Block::parse(lines.iter().map(|sp| (sp.of(self.src), sp.start()))).map_or( + 0, + |(kind, span, len)| { + match &kind { + Block::Leaf(_) => { + self.tree.enter(kind, span); + lines[0] = lines[0].with_start(span.end()); + for line in lines.iter().take(len) { + self.tree.elem(Atom::Inline, *line); + } + } + Block::Container(c) => { + let (skip_chars, skip_lines_suffix) = match &c { + Blockquote => (1, 0), + ListItem { indent } | Footnote { indent } => (*indent, 0), + Div { .. } => (0, 1), + }; + let line_count = lines.len() - skip_lines_suffix; + + // update spans, remove indentation / container prefix + lines[0] = lines[0].with_start(span.end()); + lines.iter_mut().skip(1).take(line_count).for_each(|sp| { + let skip = (sp + .of(self.src) + .chars() + .take_while(|c| c.is_whitespace()) + .count() + + skip_chars) + .min(sp.len()); + *sp = sp.trim_start(skip); + }); + + self.tree.enter(kind, span); + let mut l = 0; + while l < line_count { + l += self.parse_block(&mut lines[l..line_count]); + } + } + } + self.tree.exit(); + blanklines + len + }, + ) + } +} + +impl Block { + /// Parse a single block. Return number of lines the block uses. + fn parse<'b, I: Iterator>( + mut lines: I, + ) -> Option<(Block, Span, usize)> { + if let Some((l, start)) = lines.next() { + let (kind, sp) = Block::start(l); + let line_count = 1 + lines.take_while(|(l, _)| kind.continues(l)).count(); + Some((kind, sp.translate(start), line_count)) + } else { + None + } + } + + /// Determine what type of block a line can start. + fn start(line: &str) -> (Block, Span) { + let start = line.chars().take_while(|c| c.is_whitespace()).count(); + let line = &line[start..]; + let mut chars = line.chars(); + match chars.next().unwrap_or(EOF) { + '#' => chars + .find(|c| *c != '#') + .map_or(true, char::is_whitespace) + .then(|| { + let span = Span::by_len(start, line.len() - chars.as_str().len() - 1); + (Self::Leaf(Heading { level: span.len() }), span) + }), + '>' => chars.next().map_or(true, |c| c == ' ').then(|| { + ( + Self::Container(Blockquote), + Span::by_len(start, line.len() - chars.as_str().len() - 1), + ) + }), + f @ ':' => { + let fence_length = chars.take_while(|c| *c == f).count() + 1; + (fence_length >= 3).then(|| { + ( + Self::Container(Div { fence_length }), + Span::by_len(start, line.len()), + ) + }) + } + fence_char @ ('`' | '~') => { + let fence_length = chars.take_while(|c| *c == fence_char).count() + 1; + (fence_length >= 3).then(|| { + ( + Self::Leaf(CodeBlock { + fence_char, + fence_length, + }), + Span::by_len(start, line.len()), + ) + }) + } + _ => { + let thematic_break = || { + let mut without_whitespace = line.chars().filter(|c| !c.is_whitespace()); + let length = without_whitespace.clone().count(); + (length >= 3 + && (without_whitespace.clone().all(|c| c == '-') + || without_whitespace.all(|c| c == '*'))) + .then(|| (Self::Leaf(ThematicBreak), Span::by_len(start, line.len()))) + }; + thematic_break() + } + } + .unwrap_or((Self::Leaf(Paragraph), Span::new(0, 0))) + } + + /// Determine if this line continues a block of a certain type. + fn continues(&self, line: &str) -> bool { + match self { + Self::Leaf(Paragraph | Heading { .. } | Table | LinkDefinition) => { + !line.trim().is_empty() + } + Self::Leaf(Attributes | ThematicBreak) => false, + Self::Leaf(CodeBlock { + fence_char, + fence_length, + }) => !line.chars().take(*fence_length).all(|c| c == *fence_char), + Self::Container(Blockquote) => line.trim().starts_with('>'), + Self::Container(Footnote { indent } | ListItem { indent }) => { + let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); + !line.trim().is_empty() && spaces >= *indent + } + Self::Container(Div { fence_length }) => { + line.chars().take(*fence_length).all(|c| c == ':') + } + } + } +} + +impl std::fmt::Display for Block { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Block::Leaf(e) => std::fmt::Debug::fmt(e, f), + Block::Container(c) => std::fmt::Debug::fmt(c, f), + } + } +} + +impl std::fmt::Display for Atom { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Inline") + } +} + +/// Similar to `std::str::split('\n')` but newline is included and spans are used instead of `str`. +fn lines(src: &str) -> impl Iterator + '_ { + let mut chars = src.chars(); + std::iter::from_fn(move || { + if chars.as_str().is_empty() { + None + } else { + let start = src.len() - chars.as_str().len(); + chars.find(|c| *c == '\n'); + let end = src.len() - chars.as_str().len(); + if start == end { + None + } else { + Some(Span::new(start, end)) + } + } + }) +} + +#[cfg(test)] +mod test { + use crate::tree::Event; + use crate::Span; + + use super::Atom::*; + use super::Block; + use super::Block::*; + use super::Container::*; + use super::Leaf::*; + + #[test] + fn parse_elem_oneline() { + let src = "para\n"; + + assert_eq!( + super::Parser::new(src).parse().iter().collect::>(), + &[ + Event::Enter(&Leaf(Paragraph), Span::new(0, 0)), + Event::Element(&Inline, Span::new(0, 5)), + Event::Exit, + ], + ); + } + + #[test] + fn parse_elem_multiline() { + let src = "para\npara\n"; + + assert_eq!( + super::Parser::new(src).parse().iter().collect::>(), + &[ + Event::Enter(&Leaf(Paragraph), Span::new(0, 0)), + Event::Element(&Inline, Span::new(0, 5)), + Event::Element(&Inline, Span::new(5, 10)), + Event::Exit, + ], + ); + } + + #[test] + fn parse_elem_multi() { + let src = concat!( + "# 2\n", + "\n", + " # 8\n", + " 12\n", + "15\n", // + ); + + assert_eq!( + super::Parser::new(src).parse().iter().collect::>(), + &[ + Event::Enter(&Leaf(Heading { level: 1 }), Span::new(0, 1)), + Event::Element(&Inline, Span::new(1, 4)), + Event::Exit, + Event::Element(&Blankline, Span::new(4, 5)), + Event::Enter(&Leaf(Heading { level: 1 }), Span::new(6, 7)), + Event::Element(&Inline, Span::new(7, 10)), + Event::Element(&Inline, Span::new(10, 15)), + Event::Element(&Inline, Span::new(15, 18)), + Event::Exit, + ], + ); + } + + #[test] + fn parse_container() { + let src = concat!( + "> a\n", + ">\n", + "> ## hl\n", + ">\n", + "> para\n", // + ); + + assert_eq!( + super::Parser::new(src).parse().iter().collect::>(), + &[ + Event::Enter(&Container(Blockquote), Span::new(0, 1)), + Event::Enter(&Leaf(Paragraph), Span::new(1, 1)), + Event::Element(&Inline, Span::new(1, 4)), + Event::Exit, + Event::Element(&Blankline, Span::new(5, 6)), + Event::Enter(&Leaf(Heading { level: 2 }), Span::new(8, 10)), + Event::Element(&Inline, Span::new(10, 14)), + Event::Exit, + Event::Element(&Blankline, Span::new(15, 16)), + Event::Enter(&Leaf(Paragraph), Span::new(17, 17)), + Event::Element(&Inline, Span::new(17, 23)), + Event::Exit, + Event::Exit, + ] + ); + } + + #[test] + fn block_multiline() { + let src = "# heading\n spanning two lines\n"; + let lines = super::lines(src).map(|sp| (sp.of(src), sp.start())); + let (kind, sp, len) = Block::parse(lines).unwrap(); + assert_eq!(kind, Block::Leaf(Heading { level: 1 })); + assert_eq!(sp.of(src), "#"); + assert_eq!(len, 2); + } + + #[test] + fn block_container() { + let src = concat!( + "> a\n", + ">\n", + " > b\n", + ">\n", + "> c\n", // + ); + let lines = super::lines(src).map(|sp| (sp.of(src), sp.start())); + let (kind, sp, len) = Block::parse(lines).unwrap(); + assert_eq!(kind, Block::Container(Blockquote)); + assert_eq!(sp.of(src), ">"); + assert_eq!(len, 5); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..37102fb --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,10 @@ +mod block; +mod span; +mod tree; + +pub use block::parse; +pub use block::Tree; + +const EOF: char = '\0'; + +use span::Span; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..4131af4 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,10 @@ +use std::io::Read; + +fn main() { + let mut src = String::new(); + std::io::stdin() + .read_to_string(&mut src) + .expect("failed to read unicode file"); + + print!("{}", jotdown::parse(&src)); +} diff --git a/src/span.rs b/src/span.rs new file mode 100644 index 0000000..80ee4ee --- /dev/null +++ b/src/span.rs @@ -0,0 +1,49 @@ +#[derive(Clone, Copy, Default, Debug, PartialEq, Eq)] +pub struct Span { + start: u32, + end: u32, +} + +impl Span { + pub fn new(start: usize, end: usize) -> Self { + Self::by_len(start, end.checked_sub(start).unwrap()) + } + + pub fn by_len(start: usize, len: usize) -> Self { + Self { + start: start.try_into().unwrap(), + end: start.checked_add(len).unwrap().try_into().unwrap(), + } + } + + pub fn with_start(self, start: usize) -> Self { + Self::new(start, self.end()) + } + + pub fn trim_start(self, n: usize) -> Self { + Self::new(self.start().checked_add(n).unwrap(), self.end()) + } + + pub fn translate(self, n: usize) -> Self { + Self::new( + self.start().checked_add(n).unwrap(), + self.end().checked_add(n).unwrap(), + ) + } + + pub fn start(self) -> usize { + self.start.try_into().unwrap() + } + + pub fn end(self) -> usize { + self.end.try_into().unwrap() + } + + pub fn len(self) -> usize { + self.end() - self.start() + } + + pub fn of(self, s: &str) -> &str { + &s[self.start()..self.end()] + } +} diff --git a/src/tree.rs b/src/tree.rs new file mode 100644 index 0000000..7c06efb --- /dev/null +++ b/src/tree.rs @@ -0,0 +1,268 @@ +use crate::Span; + +#[derive(Debug)] +pub struct Tree { + nodes: Vec>, +} + +impl Tree { + fn new(nodes: Vec>) -> Self { + Self { nodes } + } + + pub fn iter(&self) -> Iter { + self.into() + } +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Event<'a, C, E> { + Enter(&'a C, Span), + Element(&'a E, Span), + Exit, +} + +pub struct Iter<'a, C, E> { + nodes: &'a [Node], + branch: Vec, + head: Option, +} + +impl<'a, C, E> Iterator for Iter<'a, C, E> { + type Item = Event<'a, C, E>; + + fn next(&mut self) -> Option { + if let Some(head) = self.head { + let n = &self.nodes[head.index()]; + match &n.kind { + NodeKind::Root => { + self.head = n.next; + self.next() + } + NodeKind::Container(c, child) => { + self.branch.push(head); + self.head = *child; + Some(Event::Enter(c, n.span)) + } + NodeKind::Element(e) => { + self.head = n.next; + Some(Event::Element(e, n.span)) + } + } + } else if let Some(block_ni) = self.branch.pop() { + let Node { next, .. } = &self.nodes[block_ni.index()]; + self.head = *next; + Some(Event::Exit) + } else { + None + } + } +} + +impl<'a, C, E> From<&'a Tree> for Iter<'a, C, E> { + fn from(tree: &'a Tree) -> Self { + Self { + nodes: &tree.nodes, + branch: Vec::new(), + head: Some(NodeIndex::root()), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct NodeIndex(std::num::NonZeroUsize); + +impl NodeIndex { + fn new(i: usize) -> Self { + assert_ne!(i, usize::MAX); + Self((i + 1).try_into().unwrap()) + } + + fn root() -> Self { + Self::new(0) + } + + fn index(self) -> usize { + usize::from(self.0) - 1 + } +} + +#[derive(Debug, Clone)] +enum NodeKind { + Root, + Container(C, Option), + Element(E), +} + +#[derive(Debug, Clone)] +struct Node { + span: Span, + kind: NodeKind, + next: Option, +} + +#[derive(Debug, Clone)] +pub struct Builder { + nodes: Vec>, + branch: Vec, + head: Option, +} + +impl Builder { + pub(super) fn new() -> Self { + Builder { + nodes: vec![Node { + span: Span::default(), + kind: NodeKind::Root, + next: None, + }], + branch: vec![], + head: Some(NodeIndex::root()), + } + } + + pub(super) fn elem(&mut self, e: E, span: Span) { + self.add_node(Node { + span, + kind: NodeKind::Element(e), + next: None, + }); + } + + pub(super) fn enter(&mut self, c: C, span: Span) { + self.add_node(Node { + span, + kind: NodeKind::Container(c, None), + next: None, + }); + } + + pub(super) fn exit(&mut self) { + if self.head.is_some() { + self.head = None; + } else { + let last = self.branch.pop(); + assert_ne!(last, None); + } + } + + pub(super) fn finish(self) -> Tree { + Tree::new(self.nodes) + } + + fn add_node(&mut self, node: Node) { + let ni = NodeIndex::new(self.nodes.len()); + self.nodes.push(node); + if let Some(head_ni) = &mut self.head { + let mut head = &mut self.nodes[head_ni.index()]; + match &mut head.kind { + NodeKind::Root | NodeKind::Element(_) => { + // update next pointer of previous node + assert_eq!(head.next, None); + head.next = Some(ni); + } + NodeKind::Container(_, child) => { + self.branch.push(*head_ni); + // update child pointer of current container + assert_eq!(*child, None); + *child = Some(ni); + } + } + } else if let Some(block) = self.branch.pop() { + let mut block = &mut self.nodes[block.index()]; + assert!(matches!(block.kind, NodeKind::Container(..))); + block.next = Some(ni); + } else { + panic!() + } + self.head = Some(ni); + } +} + +impl std::fmt::Display + for Builder +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.clone().finish().fmt(f) + } +} + +impl std::fmt::Display for Tree { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + const INDENT: &str = " "; + let mut level = 0; + for e in self.iter() { + let indent = INDENT.repeat(level); + match e { + Event::Enter(container, sp) => { + writeln!(f, "{}{} ({}:{})", indent, container, sp.start(), sp.end())?; + level += 1; + } + Event::Exit => level -= 1, + Event::Element(element, sp) => { + writeln!(f, "{}{} ({}:{})", indent, element, sp.start(), sp.end())?; + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use crate::Span; + + #[test] + fn fmt_linear() { + let mut tree: super::Builder = super::Builder::new(); + tree.elem(1, Span::new(0, 1)); + tree.elem(2, Span::new(1, 2)); + tree.elem(3, Span::new(3, 4)); + assert_eq!( + tree.to_string(), + concat!( + "1 (0:1)\n", + "2 (1:2)\n", + "3 (3:4)\n", // + ) + ); + } + + #[test] + fn fmt_container() { + let mut tree: super::Builder = super::Builder::new(); + tree.enter(1, Span::new(0, 1)); + tree.elem(11, Span::new(0, 1)); + tree.elem(12, Span::new(0, 1)); + tree.exit(); + tree.enter(2, Span::new(1, 5)); + tree.enter(21, Span::new(2, 5)); + tree.enter(211, Span::new(3, 4)); + tree.elem(2111, Span::new(3, 4)); + tree.exit(); + tree.exit(); + tree.enter(22, Span::new(4, 5)); + tree.elem(221, Span::new(4, 5)); + tree.exit(); + tree.exit(); + tree.enter(3, Span::new(5, 6)); + tree.elem(31, Span::new(5, 6)); + tree.exit(); + assert_eq!( + tree.to_string(), + concat!( + "1 (0:1)\n", + " 11 (0:1)\n", + " 12 (0:1)\n", + "2 (1:5)\n", + " 21 (2:5)\n", + " 211 (3:4)\n", + " 2111 (3:4)\n", + " 22 (4:5)\n", + " 221 (4:5)\n", + "3 (5:6)\n", + " 31 (5:6)\n", + ) + ); + } +}