parse block elements

This commit is contained in:
Noah Hellman 2022-11-12 18:45:17 +01:00
commit 40a612df95
7 changed files with 743 additions and 0 deletions

7
Cargo.lock generated Normal file
View file

@ -0,0 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "jotdown"
version = "0.1.0"

15
Cargo.toml Normal file
View file

@ -0,0 +1,15 @@
[package]
name = "jotdown"
description = "A parser for the Djot markup language"
authors = ["Noah Hellman <noah@hllmn.net>"]
version = "0.1.0"
license = "MIT"
edition = "2021"
keywords = ["djot", "markup"]
categories = ["parser-implementations"]
homepage = "https://hllmn.net/projects/jotdown"
repository = "https://github.com/hellux/jotdown"
documentation = "https://docs.rs/jotdown"
exclude = [
"Makefile",
]

384
src/block.rs Normal file
View file

@ -0,0 +1,384 @@
use crate::Span;
use crate::EOF;
use crate::tree;
use Container::*;
use Leaf::*;
/// Block-level tree: containers are [`Block`]s and elements are [`Atom`]s.
pub type Tree = tree::Tree<Block, Atom>;
/// Parse the block structure of `src` and return the resulting [`Tree`].
pub fn parse(src: &str) -> Tree {
    let parser = Parser::new(src);
    parser.parse()
}
/// A block-level node of the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Block {
/// Block whose lines are stored as inline elements instead of being parsed into child blocks.
Leaf(Leaf),
/// Block whose lines are parsed recursively into child blocks.
Container(Container),
}
/// Kinds of leaf block.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Leaf {
/// Fallback kind for a line that does not start any other block.
Paragraph,
/// Line starting with a run of `#` that is followed by whitespace or the end of the input.
Heading {
/// Length of the `#` marker.
level: usize,
},
// NOTE(review): `Attributes`, `Table` and `LinkDefinition` are never produced by
// `Block::start` in this file; presumably placeholders for later work.
Attributes,
Table,
/// Line with at least three non-whitespace characters that are all `-` or all `*`.
ThematicBreak,
LinkDefinition,
/// Block opened by a fence of at least three backticks or tildes.
CodeBlock {
/// Character the opening fence is made of.
fence_char: char,
/// Number of fence characters in the opening fence.
fence_length: usize,
},
}
/// Kinds of container block.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Container {
/// Opened by `>` followed by a space or the end of the input.
Blockquote,
/// Opened by a fence of at least three `:`; `fence_length` is the number of colons.
Div { fence_length: usize },
// NOTE(review): `ListItem` and `Footnote` are never produced by `Block::start` in this file.
// `indent` is the minimum number of leading whitespace characters a following line needs in
// order to continue the block.
ListItem { indent: usize },
Footnote { indent: usize },
}
/// Element stored in the block tree; each one covers a span of a single source line.
#[derive(Debug, PartialEq, Eq)]
pub enum Atom {
/// Inline content with unparsed inline elements.
Inline,
/// A line with no non-whitespace characters.
Blankline,
}
/// Block parser state: the source text and the tree being built from it.
struct Parser<'s> {
// Complete source text; every span handled by the parser indexes into it.
src: &'s str,
// Builder that receives enter/element/exit calls while blocks are parsed.
tree: tree::Builder<Block, Atom>,
}
impl<'s> Parser<'s> {
    /// Create a parser over `src` with an empty tree.
    #[must_use]
    pub fn new(src: &'s str) -> Self {
        Self {
            src,
            tree: tree::Builder::new(),
        }
    }

    /// Consume the parser and parse all of `src` into a block tree.
    #[must_use]
    pub fn parse(mut self) -> Tree {
        let mut lines = lines(self.src).collect::<Vec<_>>();
        let mut line_pos = 0;
        while line_pos < lines.len() {
            let line_count = self.parse_block(&mut lines[line_pos..]);
            if line_count == 0 {
                break;
            }
            line_pos += line_count;
        }
        self.tree.finish()
    }

    /// Recursively parse a block and all of its children.
    ///
    /// Leading blank lines are emitted as [`Atom::Blankline`] elements first. Returns the number
    /// of lines consumed, blank lines included; this is zero only when `lines` is empty, so
    /// callers that loop over the result always make progress.
    ///
    /// The spans of lines belonging to a container are rewritten in place so that the container
    /// prefix is no longer part of them when the children are parsed.
    fn parse_block(&mut self, lines: &mut [Span]) -> usize {
        let src = self.src;

        let blanklines = lines
            .iter()
            .take_while(|sp| sp.of(src).trim().is_empty())
            .count();
        for sp in &lines[..blanklines] {
            self.tree.elem(Atom::Blankline, *sp);
        }

        let lines = &mut lines[blanklines..];
        let (kind, span, len) =
            match Block::parse(lines.iter().map(|sp| (sp.of(src), sp.start()))) {
                Some(block) => block,
                // Only blank lines were left. They have been consumed and must be reported,
                // otherwise a container ending in blank lines would loop forever.
                None => return blanklines,
            };

        // Everything past `len` belongs to later blocks and must be neither parsed nor modified
        // as part of this one.
        let lines = &mut lines[..len];

        match &kind {
            Block::Leaf(_) => {
                self.tree.enter(kind, span);
                // the block marker is not part of the inline content
                lines[0] = lines[0].with_start(span.end());
                for line in lines.iter() {
                    self.tree.elem(Atom::Inline, *line);
                }
            }
            Block::Container(c) => {
                let (skip_chars, skip_lines_suffix) = match c {
                    Blockquote => (1, 0),
                    ListItem { indent } | Footnote { indent } => (*indent, 0),
                    Div { .. } => (0, 1),
                };
                // The opening line can never double as the closing line, so a suffix is only
                // stripped when the block has more than one line. This also guarantees that a
                // container always has at least one line to parse children from.
                let line_count = if len > skip_lines_suffix {
                    len - skip_lines_suffix
                } else {
                    len
                };

                // update spans, remove indentation / container prefix
                lines[0] = lines[0].with_start(span.end());
                for sp in lines.iter_mut().skip(1) {
                    let text = sp.of(src);
                    // spans are byte offsets, so measure the indentation in bytes as well
                    let indent = text.len() - text.trim_start().len();
                    *sp = sp.trim_start((indent + skip_chars).min(sp.len()));
                }

                self.tree.enter(kind, span);
                let mut l = 0;
                while l < line_count {
                    let consumed = self.parse_block(&mut lines[l..line_count]);
                    if consumed == 0 {
                        // cannot happen for a non-empty slice; guard against hanging anyway
                        break;
                    }
                    l += consumed;
                }
            }
        }

        self.tree.exit();
        blanklines + len
    }
}
impl Block {
    /// Parse a single block from the start of `lines`.
    ///
    /// Every item is the text of a line paired with the offset of that line in the source.
    /// Returns the kind of the block, the span of its opening marker translated by the offset of
    /// the first line, and the number of lines the block uses. Returns `None` when `lines` is
    /// empty.
    fn parse<'b, I: Iterator<Item = (&'b str, usize)>>(
        mut lines: I,
    ) -> Option<(Block, Span, usize)> {
        let (first, offset) = lines.next()?;
        let (kind, marker) = Block::start(first);
        let line_count = 1 + lines.take_while(|(l, _)| kind.continues(l)).count();
        Some((kind, marker.translate(offset), line_count))
    }

    /// Determine what type of block a line can start.
    ///
    /// The returned span is relative to the start of `line` and covers the opening marker of the
    /// block. A paragraph has no marker and gets an empty span at offset zero.
    fn start(line: &str) -> (Block, Span) {
        // Length of the leading whitespace in bytes. It is used both to slice `line` and as a
        // span offset, so it must not be a character count.
        let start = line.len() - line.trim_start().len();
        let line = &line[start..];
        let mut chars = line.chars();

        let block = match chars.next().unwrap_or(EOF) {
            '#' => {
                // `#` is a single byte, so the count is valid as level, length and byte index,
                // also when the marker is the very last thing in the input.
                let level = line.chars().take_while(|c| *c == '#').count();
                line[level..]
                    .chars()
                    .next()
                    .map_or(true, char::is_whitespace)
                    .then(|| (Self::Leaf(Heading { level }), Span::by_len(start, level)))
            }
            '>' => chars
                .next()
                .map_or(true, |c| c == ' ')
                // the marker is exactly the `>` character
                .then(|| (Self::Container(Blockquote), Span::by_len(start, 1))),
            f @ ':' => {
                let fence_length = chars.take_while(|c| *c == f).count() + 1;
                (fence_length >= 3).then(|| {
                    (
                        Self::Container(Div { fence_length }),
                        Span::by_len(start, line.len()),
                    )
                })
            }
            fence_char @ ('`' | '~') => {
                let fence_length = chars.take_while(|c| *c == fence_char).count() + 1;
                (fence_length >= 3).then(|| {
                    (
                        Self::Leaf(CodeBlock {
                            fence_char,
                            fence_length,
                        }),
                        Span::by_len(start, line.len()),
                    )
                })
            }
            _ => {
                // at least three marks, all `-` or all `*`, with whitespace ignored
                let marks = || line.chars().filter(|c| !c.is_whitespace());
                let is_break = marks().count() >= 3
                    && (marks().all(|c| c == '-') || marks().all(|c| c == '*'));
                is_break.then(|| (Self::Leaf(ThematicBreak), Span::by_len(start, line.len())))
            }
        };

        block.unwrap_or((Self::Leaf(Paragraph), Span::new(0, 0)))
    }

    /// Determine if this line continues a block of a certain type.
    fn continues(&self, line: &str) -> bool {
        match self {
            // ended by the first blank line
            Self::Leaf(Paragraph | Heading { .. } | Table | LinkDefinition) => {
                !line.trim().is_empty()
            }
            // always a single line
            Self::Leaf(Attributes | ThematicBreak) => false,
            // continues until a line that begins with a full fence
            Self::Leaf(CodeBlock {
                fence_char,
                fence_length,
            }) => !line.chars().take(*fence_length).all(|c| c == *fence_char),
            Self::Container(Blockquote) => line.trim().starts_with('>'),
            Self::Container(Footnote { indent } | ListItem { indent }) => {
                let spaces = line.chars().take_while(|c| c.is_whitespace()).count();
                !line.trim().is_empty() && spaces >= *indent
            }
            // NOTE(review): unlike the code block rule above this is not negated, so a div only
            // continues over lines that begin with a fence. Confirm whether that is intended.
            Self::Container(Div { fence_length }) => {
                line.chars().take(*fence_length).all(|c| c == ':')
            }
        }
    }
}
impl std::fmt::Display for Block {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Block::Leaf(e) => std::fmt::Debug::fmt(e, f),
Block::Container(c) => std::fmt::Debug::fmt(c, f),
}
}
}
/// Displays the name of the atom variant, so blank lines and inline content can be told apart
/// in the printed tree.
impl std::fmt::Display for Atom {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            Self::Inline => "Inline",
            Self::Blankline => "Blankline",
        })
    }
}
/// Split `src` into lines, yielding the span of each line instead of a `str`.
///
/// Similar to `std::str::split('\n')`, except that the newline is part of the line it terminates
/// and that no empty line is produced after a trailing newline.
fn lines(src: &str) -> impl Iterator<Item = Span> + '_ {
    let mut pos = 0;
    std::iter::from_fn(move || {
        let rest = &src[pos..];
        if rest.is_empty() {
            return None;
        }
        // up to and including the newline, or the whole rest if there is none
        let len = rest.find('\n').map_or(rest.len(), |i| i + 1);
        let line = Span::new(pos, pos + len);
        pos += len;
        Some(line)
    })
}
// Tests for the block parser. The first group checks the event stream of a whole parse, the
// last two check `Block::parse` on its own.
#[cfg(test)]
mod test {
use crate::tree::Event;
use crate::Span;
use super::Atom::*;
use super::Block;
use super::Block::*;
use super::Container::*;
use super::Leaf::*;
// A single line becomes a paragraph with one inline element covering the whole line.
#[test]
fn parse_elem_oneline() {
let src = "para\n";
assert_eq!(
super::Parser::new(src).parse().iter().collect::<Vec<_>>(),
&[
Event::Enter(&Leaf(Paragraph), Span::new(0, 0)),
Event::Element(&Inline, Span::new(0, 5)),
Event::Exit,
],
);
}
// Consecutive non-blank lines belong to the same paragraph, one inline element per line.
#[test]
fn parse_elem_multiline() {
let src = "para\npara\n";
assert_eq!(
super::Parser::new(src).parse().iter().collect::<Vec<_>>(),
&[
Event::Enter(&Leaf(Paragraph), Span::new(0, 0)),
Event::Element(&Inline, Span::new(0, 5)),
Event::Element(&Inline, Span::new(5, 10)),
Event::Exit,
],
);
}
// Two headings separated by a blank line; the second heading continues over the lines after it.
#[test]
fn parse_elem_multi() {
let src = concat!(
"# 2\n",
"\n",
" # 8\n",
" 12\n",
"15\n", //
);
assert_eq!(
super::Parser::new(src).parse().iter().collect::<Vec<_>>(),
&[
Event::Enter(&Leaf(Heading { level: 1 }), Span::new(0, 1)),
Event::Element(&Inline, Span::new(1, 4)),
Event::Exit,
Event::Element(&Blankline, Span::new(4, 5)),
Event::Enter(&Leaf(Heading { level: 1 }), Span::new(6, 7)),
Event::Element(&Inline, Span::new(7, 10)),
Event::Element(&Inline, Span::new(10, 15)),
Event::Element(&Inline, Span::new(15, 18)),
Event::Exit,
],
);
}
// A blockquote holding a paragraph, a heading and a paragraph, separated by blank lines. The
// expected child spans start after the `>` prefix of each line.
#[test]
fn parse_container() {
let src = concat!(
"> a\n",
">\n",
"> ## hl\n",
">\n",
"> para\n", //
);
assert_eq!(
super::Parser::new(src).parse().iter().collect::<Vec<_>>(),
&[
Event::Enter(&Container(Blockquote), Span::new(0, 1)),
Event::Enter(&Leaf(Paragraph), Span::new(1, 1)),
Event::Element(&Inline, Span::new(1, 4)),
Event::Exit,
Event::Element(&Blankline, Span::new(5, 6)),
Event::Enter(&Leaf(Heading { level: 2 }), Span::new(8, 10)),
Event::Element(&Inline, Span::new(10, 14)),
Event::Exit,
Event::Element(&Blankline, Span::new(15, 16)),
Event::Enter(&Leaf(Paragraph), Span::new(17, 17)),
Event::Element(&Inline, Span::new(17, 23)),
Event::Exit,
Event::Exit,
]
);
}
// `Block::parse` alone: a heading followed by a non-blank line uses two lines, and its span
// covers only the `#` marker.
#[test]
fn block_multiline() {
let src = "# heading\n spanning two lines\n";
let lines = super::lines(src).map(|sp| (sp.of(src), sp.start()));
let (kind, sp, len) = Block::parse(lines).unwrap();
assert_eq!(kind, Block::Leaf(Heading { level: 1 }));
assert_eq!(sp.of(src), "#");
assert_eq!(len, 2);
}
// `Block::parse` alone: a blockquote extends over every following line that starts with `>`,
// including an indented one.
#[test]
fn block_container() {
let src = concat!(
"> a\n",
">\n",
" > b\n",
">\n",
"> c\n", //
);
let lines = super::lines(src).map(|sp| (sp.of(src), sp.start()));
let (kind, sp, len) = Block::parse(lines).unwrap();
assert_eq!(kind, Block::Container(Blockquote));
assert_eq!(sp.of(src), ">");
assert_eq!(len, 5);
}
}

10
src/lib.rs Normal file
View file

@ -0,0 +1,10 @@
mod block;
mod span;
mod tree;
pub use block::parse;
pub use block::Tree;
/// Stand-in character used where a next character is needed but the input has run out.
const EOF: char = '\0';
use span::Span;

10
src/main.rs Normal file
View file

@ -0,0 +1,10 @@
use std::io::Read;
/// Read a document from standard input, parse its block structure and print the resulting tree.
fn main() {
    let mut input = String::new();
    let mut stdin = std::io::stdin();
    stdin
        .read_to_string(&mut input)
        .expect("failed to read unicode file");
    let tree = jotdown::parse(&input);
    print!("{}", tree);
}

49
src/span.rs Normal file
View file

@ -0,0 +1,49 @@
/// Half-open byte range `start..end` into a source string, stored as two `u32` offsets.
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq)]
pub struct Span {
    start: u32,
    end: u32,
}

impl Span {
    /// Span from `start` up to, but not including, `end`.
    ///
    /// Panics if `end` is smaller than `start` or if an offset does not fit in a `u32`.
    pub fn new(start: usize, end: usize) -> Self {
        let len = end.checked_sub(start).unwrap();
        Self::by_len(start, len)
    }

    /// Span of `len` bytes beginning at `start`.
    ///
    /// Panics if the end overflows or if an offset does not fit in a `u32`.
    pub fn by_len(start: usize, len: usize) -> Self {
        let first = u32::try_from(start).unwrap();
        let last = u32::try_from(start.checked_add(len).unwrap()).unwrap();
        Self {
            start: first,
            end: last,
        }
    }

    /// Same end, new start. Panics if `start` lies past the end.
    pub fn with_start(self, start: usize) -> Self {
        let end = self.end();
        Self::new(start, end)
    }

    /// Drop the first `n` bytes. Panics if `n` exceeds the length of the span.
    pub fn trim_start(self, n: usize) -> Self {
        let start = self.start().checked_add(n).unwrap();
        Self::new(start, self.end())
    }

    /// Move both ends forward by `n`.
    pub fn translate(self, n: usize) -> Self {
        let start = self.start().checked_add(n).unwrap();
        let end = self.end().checked_add(n).unwrap();
        Self::new(start, end)
    }

    /// Offset of the first byte.
    pub fn start(self) -> usize {
        usize::try_from(self.start).unwrap()
    }

    /// Offset one past the last byte.
    pub fn end(self) -> usize {
        usize::try_from(self.end).unwrap()
    }

    /// Number of bytes covered.
    pub fn len(self) -> usize {
        let (start, end) = (self.start(), self.end());
        end - start
    }

    /// The part of `s` covered by this span. Panics if the range is not valid for `s`.
    pub fn of(self, s: &str) -> &str {
        let range = self.start()..self.end();
        &s[range]
    }
}

268
src/tree.rs Normal file
View file

@ -0,0 +1,268 @@
use crate::Span;
#[derive(Debug)]
pub struct Tree<C, E> {
nodes: Vec<Node<C, E>>,
}
impl<C, E> Tree<C, E> {
fn new(nodes: Vec<Node<C, E>>) -> Self {
Self { nodes }
}
pub fn iter(&self) -> Iter<C, E> {
self.into()
}
}
/// Item produced when iterating over a tree.
#[derive(Debug, PartialEq, Eq)]
pub enum Event<'a, C, E> {
/// A container starts; everything up to the matching `Exit` is inside it.
Enter(&'a C, Span),
/// An element, which has no children.
Element(&'a E, Span),
/// The most recently entered container that is still open ends.
Exit,
}
/// Iterator over the events of a tree.
pub struct Iter<'a, C, E> {
// All nodes of the tree being traversed.
nodes: &'a [Node<C, E>],
// Containers that have been entered but not yet exited, innermost last.
branch: Vec<NodeIndex>,
// Node to visit next; `None` when the current level has no more nodes.
head: Option<NodeIndex>,
}
impl<'a, C, E> Iterator for Iter<'a, C, E> {
    type Item = Event<'a, C, E>;

    /// Produce the next event, or `None` once every entered container has been exited.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let head = match self.head {
                Some(head) => head,
                None => {
                    // end of a level: leave the innermost open container and go on with
                    // whatever follows it
                    let block_ni = self.branch.pop()?;
                    self.head = self.nodes[block_ni.index()].next;
                    return Some(Event::Exit);
                }
            };

            let n = &self.nodes[head.index()];
            match &n.kind {
                // the root produces no event of its own, move on to the node after it
                NodeKind::Root => self.head = n.next,
                NodeKind::Container(c, child) => {
                    self.branch.push(head);
                    self.head = *child;
                    return Some(Event::Enter(c, n.span));
                }
                NodeKind::Element(e) => {
                    self.head = n.next;
                    return Some(Event::Element(e, n.span));
                }
            }
        }
    }
}
impl<'a, C, E> From<&'a Tree<C, E>> for Iter<'a, C, E> {
fn from(tree: &'a Tree<C, E>) -> Self {
Self {
nodes: &tree.nodes,
branch: Vec::new(),
head: Some(NodeIndex::root()),
}
}
}
/// Index of a node, stored as the index plus one so that the wrapped value is never zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct NodeIndex(std::num::NonZeroUsize);

impl NodeIndex {
    /// Wrap the index `i`. Panics if `i` is `usize::MAX`, which cannot be stored.
    fn new(i: usize) -> Self {
        assert_ne!(i, usize::MAX);
        let stored = std::num::NonZeroUsize::new(i + 1).unwrap();
        Self(stored)
    }

    /// Index of the root node, which is always the first node.
    fn root() -> Self {
        Self::new(0)
    }

    /// The plain index, usable for indexing the node list.
    fn index(self) -> usize {
        self.0.get() - 1
    }
}
/// What a node represents.
#[derive(Debug, Clone)]
enum NodeKind<C, E> {
/// The first node of every tree; it produces no event when iterating.
Root,
/// A container together with the index of its first child, if it has any.
Container(C, Option<NodeIndex>),
/// An element, which has no children.
Element(E),
}
/// A node of the tree.
#[derive(Debug, Clone)]
struct Node<C, E> {
// Span reported together with the node when iterating.
span: Span,
kind: NodeKind<C, E>,
// Index of the node that follows this one on the same level, if any.
next: Option<NodeIndex>,
}
/// Incrementally builds a tree from enter, element and exit calls.
#[derive(Debug, Clone)]
pub struct Builder<C, E> {
// All nodes created so far; the first one is the root.
nodes: Vec<Node<C, E>>,
// Containers that have received at least one child and are not yet linked to a following node.
branch: Vec<NodeIndex>,
// Most recently added node, or `None` right after a block has been exited.
head: Option<NodeIndex>,
}
impl<C, E> Builder<C, E> {
    /// Create a builder that contains only the root node.
    pub(super) fn new() -> Self {
        Builder {
            nodes: vec![Node {
                span: Span::default(),
                kind: NodeKind::Root,
                next: None,
            }],
            branch: vec![],
            head: Some(NodeIndex::root()),
        }
    }

    /// Add the element `e` at the current position.
    pub(super) fn elem(&mut self, e: E, span: Span) {
        self.add_node(Node {
            span,
            kind: NodeKind::Element(e),
            next: None,
        });
    }

    /// Add the container `c` at the current position and enter it; nodes added until the
    /// matching [`Builder::exit`] become its children.
    pub(super) fn enter(&mut self, c: C, span: Span) {
        self.add_node(Node {
            span,
            kind: NodeKind::Container(c, None),
            next: None,
        });
    }

    /// Leave the innermost container that is still open.
    ///
    /// Panics if there is no open container left to leave.
    pub(super) fn exit(&mut self) {
        match self.head.take() {
            Some(head) => {
                // A container is only the head directly after it was entered, meaning it is
                // closed without any children. It must still be recorded on the branch, or the
                // next node would be linked to the wrong block instead of becoming its sibling.
                if matches!(self.nodes[head.index()].kind, NodeKind::Container(..)) {
                    self.branch.push(head);
                }
            }
            None => {
                let last = self.branch.pop();
                assert_ne!(last, None);
            }
        }
    }

    /// Finish building and return the tree.
    pub(super) fn finish(self) -> Tree<C, E> {
        Tree::new(self.nodes)
    }

    /// Append `node` and link it from the current position: as the next node after an element
    /// or the root, as the first child of a container that was just entered, or as the next
    /// node after the block that was just exited.
    ///
    /// Panics if there is nothing to link the node from, which happens when more blocks have
    /// been exited than entered.
    fn add_node(&mut self, node: Node<C, E>) {
        let ni = NodeIndex::new(self.nodes.len());
        self.nodes.push(node);
        if let Some(head_ni) = self.head {
            let head = &mut self.nodes[head_ni.index()];
            match &mut head.kind {
                NodeKind::Root | NodeKind::Element(_) => {
                    // update next pointer of previous node
                    assert_eq!(head.next, None);
                    head.next = Some(ni);
                }
                NodeKind::Container(_, child) => {
                    self.branch.push(head_ni);
                    // update child pointer of current container
                    assert_eq!(*child, None);
                    *child = Some(ni);
                }
            }
        } else if let Some(block) = self.branch.pop() {
            let block = &mut self.nodes[block.index()];
            assert!(matches!(block.kind, NodeKind::Container(..)));
            block.next = Some(ni);
        } else {
            panic!("cannot add a node: more blocks have been exited than entered")
        }
        self.head = Some(ni);
    }
}
/// Displays the tree that would result from finishing a copy of the builder in its current state.
impl<C: std::fmt::Display + Clone, E: std::fmt::Display + Clone> std::fmt::Display
    for Builder<C, E>
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let snapshot = self.clone().finish();
        std::fmt::Display::fmt(&snapshot, f)
    }
}
/// Displays one line per container and element, showing the node followed by the start and end
/// of its span, indented according to how deeply the node is nested.
impl<C: std::fmt::Display, E: std::fmt::Display> std::fmt::Display for Tree<C, E> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        const INDENT: &str = " ";
        let mut depth = 0;
        for event in self.iter() {
            match event {
                Event::Exit => depth -= 1,
                Event::Enter(container, sp) => {
                    let pad = INDENT.repeat(depth);
                    writeln!(f, "{}{} ({}:{})", pad, container, sp.start(), sp.end())?;
                    depth += 1;
                }
                Event::Element(element, sp) => {
                    let pad = INDENT.repeat(depth);
                    writeln!(f, "{}{} ({}:{})", pad, element, sp.start(), sp.end())?;
                }
            }
        }
        Ok(())
    }
}
// Tests for the tree builder, checked through the text produced by its `Display` implementation.
#[cfg(test)]
mod test {
use crate::Span;
// Elements added one after another without any container are printed on the same level.
#[test]
fn fmt_linear() {
let mut tree: super::Builder<u8, u8> = super::Builder::new();
tree.elem(1, Span::new(0, 1));
tree.elem(2, Span::new(1, 2));
tree.elem(3, Span::new(3, 4));
assert_eq!(
tree.to_string(),
concat!(
"1 (0:1)\n",
"2 (1:2)\n",
"3 (3:4)\n", //
)
);
}
// Nested containers: children are printed below their container, and a node added after an
// exit becomes a sibling of the container that was left.
#[test]
fn fmt_container() {
let mut tree: super::Builder<u8, u16> = super::Builder::new();
tree.enter(1, Span::new(0, 1));
tree.elem(11, Span::new(0, 1));
tree.elem(12, Span::new(0, 1));
tree.exit();
tree.enter(2, Span::new(1, 5));
tree.enter(21, Span::new(2, 5));
tree.enter(211, Span::new(3, 4));
tree.elem(2111, Span::new(3, 4));
tree.exit();
tree.exit();
tree.enter(22, Span::new(4, 5));
tree.elem(221, Span::new(4, 5));
tree.exit();
tree.exit();
tree.enter(3, Span::new(5, 6));
tree.elem(31, Span::new(5, 6));
tree.exit();
assert_eq!(
tree.to_string(),
concat!(
"1 (0:1)\n",
" 11 (0:1)\n",
" 12 (0:1)\n",
"2 (1:5)\n",
" 21 (2:5)\n",
" 211 (3:4)\n",
" 2111 (3:4)\n",
" 22 (4:5)\n",
" 221 (4:5)\n",
"3 (5:6)\n",
" 31 (5:6)\n",
)
);
}
}