jotdown/src/block.rs
2023-02-05 20:36:39 +01:00

735 lines
22 KiB
Rust

use crate::Span;
use crate::EOF;
use crate::tree;
use Atom::*;
use Container::*;
use Leaf::*;
/// Block-level tree of a document: entered/exited nodes are `Block`s and childless nodes are
/// `Atom`s.
pub type Tree = tree::Tree<Block, Atom>;
/// Parse the block-level structure of `src` and return it as a [`Tree`].
#[must_use]
pub fn parse(src: &str) -> Tree {
    let parser = TreeParser::new(src);
    parser.parse()
}
/// Kind of a block element, grouped by what the block may contain.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Block {
/// An atomic block, containing no children elements.
Atom(Atom),
/// A leaf block, containing only inline elements.
Leaf(Leaf),
/// A container block, containing children blocks.
Container(Container),
}
/// Blocks that have no children.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Atom {
/// Inline content with unparsed inline elements.
/// One is emitted for each line of a leaf block.
Inline,
/// A line with no non-whitespace characters.
Blankline,
/// A list of attributes.
Attributes,
/// A thematic break.
ThematicBreak,
}
/// Blocks that contain only inline content.
///
/// Each variant documents what the span of the block event covers and what its `Inline`
/// children are.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Leaf {
/// Span is empty, before first character of paragraph.
/// Each inline is a line.
Paragraph,
/// Span is `#` characters.
/// Each inline is a line.
Heading,
/// Span is first `|` character.
/// Each inline is a line (row).
Table,
/// Span is the link tag.
/// Inlines are lines of the URL.
LinkDefinition,
/// Span is language specifier.
/// Each inline is a line.
CodeBlock,
}
/// Blocks that contain other blocks.
///
/// Each variant documents what the span of the block event covers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Container {
/// Span is first `>` character.
Blockquote,
/// Span is class specifier.
Div,
/// Span is the list marker.
ListItem,
/// Span is the footnote tag, i.e. the text between `[^` and `]:`.
Footnote,
}
/// Parser for block-level tree structure of entire document.
struct TreeParser<'s> {
/// Complete source text; all spans handed to the tree builder index into it.
src: &'s str,
/// Builder that receives the enter/exit/atom events while blocks are parsed.
tree: tree::Builder<Block, Atom>,
}
impl<'s> TreeParser<'s> {
    /// Create a parser for `src` with an empty tree builder.
    #[must_use]
    pub fn new(src: &'s str) -> Self {
        Self {
            src,
            tree: tree::Builder::new(),
        }
    }

    /// Consume the parser and build the block tree for the whole source.
    ///
    /// The source is split into line spans once; top-level blocks are then parsed one after
    /// another, each one reporting how many lines it consumed.
    #[must_use]
    pub fn parse(mut self) -> Tree {
        let mut lines = lines(self.src).collect::<Vec<_>>();
        let mut line_pos = 0;
        while line_pos < lines.len() {
            let line_count = self.parse_block(&mut lines[line_pos..]);
            if line_count == 0 {
                break;
            }
            line_pos += line_count;
        }
        self.tree.finish()
    }

    /// Recursively parse a block and all of its children. Return number of lines the block uses.
    ///
    /// `lines` are the spans of the remaining lines of the enclosing scope. They are modified in
    /// place while container prefixes and block markers are stripped.
    ///
    /// Returns 0 only when `lines` is empty. For a fenced block (code block / div) the returned
    /// count includes the closing fence even when the input ends before it, so the result may
    /// exceed `lines.len()`.
    fn parse_block(&mut self, lines: &mut [Span]) -> usize {
        let parsed = BlockParser::parse(lines.iter().map(|sp| sp.of(self.src)));
        let (indent, kind, span, line_count) = match parsed {
            Some(block) => block,
            None => return 0,
        };

        // restrict to the lines of this block; a fenced block without closing fence is truncated
        let lines = {
            let l = lines.len().min(line_count);
            &mut lines[..l]
        };
        let truncated = lines.len() < line_count;

        // the block span is relative to the first line, make it relative to the source
        let span = span.translate(lines[0].start());

        // skip part of first inline that is shared with the block span
        lines[0] = lines[0].with_start(span.end());

        // remove junk from footnotes / link defs
        if matches!(
            kind,
            Block::Leaf(LinkDefinition) | Block::Container(Footnote)
        ) {
            assert_eq!(lines[0].of(self.src).get(..2), Some("]:"));
            lines[0] = lines[0].skip(2);
        }

        // skip closing fence of code blocks / divs, it only exists if the block is not truncated
        let lines = if !truncated
            && matches!(kind, Block::Leaf(CodeBlock) | Block::Container(Div))
        {
            let l = lines.len();
            &mut lines[..l - 1]
        } else {
            lines
        };

        match kind {
            Block::Atom(a) => {
                assert_ne!(a, Inline);
                self.tree.atom(a, span);
            }
            Block::Leaf(leaf) => {
                self.tree.enter(kind, span);

                // trim starting whitespace of the block contents
                lines[0] = lines[0].trim_start(self.src);

                // skip first inline if empty (e.g. code block)
                let lines = if lines[0].is_empty() {
                    &mut lines[1..]
                } else {
                    lines
                };

                // trim ending whitespace of block if not verbatim
                if !matches!(leaf, Leaf::CodeBlock) {
                    if let Some(last) = lines.last_mut() {
                        *last = last.trim_end(self.src);
                    }
                }

                for line in lines.iter() {
                    self.tree.atom(Inline, *line);
                }
                self.tree.exit();
            }
            Block::Container(c) => {
                // Number of characters to strip from continuation lines in addition to their
                // leading whitespace. The closing fence of a div has already been removed
                // above, so no trailing line may be dropped here; doing so would silently
                // discard the last line of the div's content.
                let skip_chars = match c {
                    Blockquote => 2,
                    ListItem | Footnote => indent,
                    Div => 0,
                };

                // update spans, remove indentation / container prefix
                let src = self.src;
                lines.iter_mut().skip(1).for_each(|sp| {
                    let text = sp.of(src);
                    let whitespace = text.chars().take_while(|c| c.is_whitespace()).count();
                    // never skip past the newline that terminates the line
                    let max_skip = sp.len() - usize::from(text.ends_with('\n'));
                    *sp = sp.skip((whitespace + skip_chars).min(max_skip));
                });

                self.tree.enter(kind, span);
                let mut l = 0;
                while l < lines.len() {
                    // the slice is non-empty here, so at least one line is consumed
                    l += self.parse_block(&mut lines[l..]);
                }
                self.tree.exit();
            }
        }

        line_count
    }
}
/// Parser for a single block.
struct BlockParser {
/// Number of leading whitespace characters on the first line of the block.
indent: usize,
/// Kind of block detected from the first line.
kind: Block,
/// Span of the block marker, relative to the start of the first line.
span: Span,
/// Fence character and fence length, set when the first line starts with a fence character.
fence: Option<(char, usize)>,
}
impl BlockParser {
    /// Parse a single block. Return number of lines the block uses.
    ///
    /// The first item of `lines` determines the block kind, following items are consumed for as
    /// long as they continue the block. Returns `None` if `lines` is empty, otherwise
    /// `(indent, kind, span, line_count)` where `span` is relative to the start of the first
    /// line. For fenced blocks (code blocks and divs) `line_count` always includes one line for
    /// the closing fence, even if the input ends before it.
    fn parse<'s, I: Iterator<Item = &'s str>>(mut lines: I) -> Option<(usize, Block, Span, usize)> {
        let first = lines.next()?;
        let mut p = BlockParser::new(first);
        let has_end_delimiter = matches!(p.kind, Block::Leaf(CodeBlock) | Block::Container(Div));
        let line_count_match = lines.take_while(|l| p.continues(l)).count();
        let line_count = 1 + line_count_match + usize::from(has_end_delimiter);
        Some((p.indent, p.kind, p.span, line_count))
    }

    /// Determine the kind of block that starts at `line` and the span of its marker.
    ///
    /// Anything that is not recognized as another block becomes a paragraph.
    fn new(line: &str) -> Self {
        // `indent` counts characters while `start` is the byte offset of the first
        // non-whitespace character; they only differ for multi-byte whitespace, where using the
        // character count as a slice index would panic.
        let (indent, start) = line
            .chars()
            .take_while(|c| *c != '\n' && c.is_whitespace())
            .fold((0, 0), |(n, bytes), c| (n + 1, bytes + c.len_utf8()));
        let line_t = &line[start..];
        let mut chars = line_t.chars();
        let mut fence = None;

        let (kind, span) = match chars.next().unwrap_or(EOF) {
            EOF => Some((Block::Atom(Blankline), Span::empty_at(start))),
            '\n' => Some((Block::Atom(Blankline), Span::by_len(start, 1))),
            '#' => {
                // the span covers exactly the `#` characters, also when the line ends at EOF
                let level = 1 + chars.clone().take_while(|c| *c == '#').count();
                line_t[level..]
                    .chars()
                    .next()
                    .map_or(true, char::is_whitespace)
                    .then(|| (Block::Leaf(Heading), Span::by_len(start, level)))
            }
            '>' => chars
                .next()
                .map_or(true, char::is_whitespace)
                .then(|| (Block::Container(Blockquote), Span::by_len(start, 1))),
            // A table row starts with `|` and ends with a separate, unescaped `|`.
            // NOTE(review): the line terminator is not trimmed before the check, so only a line
            // without trailing newline can match. Kept as is since recognizing more tables
            // changes what consumers of the tree receive -- confirm intended behavior.
            '|' => (line_t.len() > 1 && line_t.ends_with('|') && !line_t.ends_with("\\|"))
                .then(|| (Block::Leaf(Table), Span::by_len(start, 1))),
            '[' => {
                let rest = chars.as_str();
                rest.find("]:").map(|end| {
                    let tag = &rest[..end];
                    // a tag starting with `^` denotes a footnote, the `^` is not part of the span
                    let (kind, tag) = match tag.strip_prefix('^') {
                        Some(tag) => (Block::Container(Footnote), tag),
                        None => (Block::Leaf(LinkDefinition), tag),
                    };
                    (kind, Span::from_slice(line, tag))
                })
            }
            '-' | '*' if Self::is_thematic_break(chars.clone()) => Some((
                Block::Atom(ThematicBreak),
                Span::from_slice(line, line_t.trim()),
            )),
            '-' => chars.next().map_or(true, char::is_whitespace).then(|| {
                let task_list = chars.next() == Some('[')
                    && matches!(chars.next(), Some('X' | ' '))
                    && chars.next() == Some(']')
                    && chars.next().map_or(true, char::is_whitespace);
                (
                    Block::Container(ListItem),
                    Span::by_len(start, if task_list { 3 } else { 1 }),
                )
            }),
            // Peek with a clone: a guard that advances `chars` would eat one `:` of a div fence
            // before the fence arm below gets to count it.
            '+' | '*' | ':' if chars.clone().next().map_or(true, char::is_whitespace) => {
                Some((Block::Container(ListItem), Span::by_len(start, 1)))
            }
            f @ ('`' | ':' | '~') => {
                let fence_length = line_t.chars().take_while(|c| *c == f).count();
                fence = Some((f, fence_length));
                let lang = line_t[fence_length..].trim();
                let valid_spec =
                    !lang.chars().any(char::is_whitespace) && !lang.chars().any(|c| c == '`');
                (valid_spec && fence_length >= 3).then(|| {
                    (
                        match f {
                            ':' => Block::Container(Div),
                            _ => Block::Leaf(CodeBlock),
                        },
                        Span::from_slice(line, lang),
                    )
                })
            }
            _ => None,
        }
        .unwrap_or((Block::Leaf(Paragraph), Span::new(0, 0)));

        Self {
            indent,
            kind,
            span,
            fence,
        }
    }

    /// Check if a line that starts with `-` or `*` is a thematic break.
    ///
    /// `chars` is the remainder of the line after the first marker. The line qualifies if it
    /// holds at least three `-`/`*` characters in total and otherwise only whitespace.
    fn is_thematic_break(chars: std::str::Chars) -> bool {
        let mut n = 1;
        for c in chars {
            if matches!(c, '-' | '*') {
                n += 1;
            } else if !c.is_whitespace() {
                return false;
            }
        }
        n >= 3
    }

    /// Determine if this line continues the block.
    fn continues(&mut self, line: &str) -> bool {
        match self.kind {
            Block::Atom(..) => false,
            Block::Leaf(Paragraph | Heading | Table) => !line.trim().is_empty(),
            Block::Leaf(LinkDefinition) => line.starts_with(' ') && !line.trim().is_empty(),
            Block::Container(Blockquote) => line.trim().starts_with('>'),
            Block::Container(Footnote | ListItem) => {
                let spaces = line.chars().take_while(|c| c.is_whitespace()).count();
                line.trim().is_empty() || spaces > self.indent
            }
            Block::Container(Div) | Block::Leaf(CodeBlock) => {
                let (fence, fence_length) = self
                    .fence
                    .expect("fenced blocks always record their fence");
                // The block continues until a line that consists of a fence of exactly the
                // opening length, optionally followed by whitespace. Counting the fence
                // characters (instead of checking a prefix) keeps a too short fence at the end
                // of the input from closing the block. Fence characters are ASCII, so the count
                // is also a valid byte offset.
                let n = line.chars().take_while(|c| *c == fence).count();
                let is_closing_fence = n == fence_length
                    && line[n..].chars().next().map_or(true, char::is_whitespace);
                !is_closing_fence
            }
        }
    }
}
/// Displays a block as the name of its innermost variant, e.g. `Heading` or `Blockquote`.
impl std::fmt::Display for Block {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // every inner enum is formatted with its derived `Debug` implementation
        let inner: &dyn std::fmt::Debug = match self {
            Block::Atom(atom) => atom,
            Block::Leaf(leaf) => leaf,
            Block::Container(container) => container,
        };
        std::fmt::Debug::fmt(inner, f)
    }
}
/// Displays an atom as the name of its variant, e.g. `Blankline`.
impl std::fmt::Display for Atom {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Forward to the derived `Debug` output so each variant prints its own name, the same
        // way `Block` displays an atom; a fixed "Inline" mislabels every other variant.
        std::fmt::Debug::fmt(self, f)
    }
}
/// Split `src` into lines, yielding the [`Span`] of each line instead of a `str`.
///
/// Unlike `str::split('\n')`, the terminating newline is part of each line and no empty line is
/// yielded after a trailing newline.
fn lines(src: &str) -> impl Iterator<Item = Span> + '_ {
    // byte offset of the first line that has not been yielded yet
    let mut pos = 0;
    std::iter::from_fn(move || {
        let rest = &src[pos..];
        if rest.is_empty() {
            return None;
        }
        // the line extends past its newline, or to the end of the source if there is none
        let len = rest.find('\n').map_or(rest.len(), |newline| newline + 1);
        let line = Span::new(pos, pos + len);
        pos += len;
        Some(line)
    })
}
// Tests for the block parser. The `parse_*` tests run the whole `TreeParser` and compare the
// resulting event sequence, the `block_*` tests run `BlockParser::parse` on a single block.
#[cfg(test)]
mod test {
use crate::tree::EventKind;
use crate::tree::EventKind::*;
use super::Atom::*;
use super::Block;
use super::Block::*;
use super::Container::*;
use super::Leaf::*;
// Parse `$src` with `TreeParser` and assert that the events of the tree, as pairs of
// (event kind, source text covered by the event span), equal the listed expected events.
macro_rules! test_parse {
($src:expr $(,$($event:expr),* $(,)?)?) => {
let t = super::TreeParser::new($src).parse();
let actual = t.map(|ev| (ev.kind, ev.span.of($src))).collect::<Vec<_>>();
let expected = &[$($($event),*,)?];
assert_eq!(actual, expected, "\n\n{}\n\n", $src);
};
}
// A single-line paragraph yields one inline without the trailing newline.
#[test]
fn parse_para_oneline() {
test_parse!(
"para\n",
(Enter(Leaf(Paragraph)), ""),
(EventKind::Atom(Inline), "para"),
(Exit(Leaf(Paragraph)), ""),
);
}
// Every line of a paragraph is its own inline; only the last one has its newline trimmed.
#[test]
fn parse_para_multiline() {
test_parse!(
"para0\npara1\n",
(Enter(Leaf(Paragraph)), ""),
(EventKind::Atom(Inline), "para0\n"),
(EventKind::Atom(Inline), "para1"),
(Exit(Leaf(Paragraph)), ""),
);
}
// Headings may be indented and continue on following non-blank lines.
#[test]
fn parse_heading_multi() {
test_parse!(
concat!(
"# 2\n",
"\n",
" # 8\n",
" 12\n",
"15\n", //
),
(Enter(Leaf(Heading)), "#"),
(EventKind::Atom(Inline), "2"),
(Exit(Leaf(Heading)), "#"),
(EventKind::Atom(Blankline), "\n"),
(Enter(Leaf(Heading)), "#"),
(EventKind::Atom(Inline), "8\n"),
(EventKind::Atom(Inline), " 12\n"),
(EventKind::Atom(Inline), "15"),
(Exit(Leaf(Heading)), "#"),
);
}
// Blockquotes contain the blocks parsed from their lines with the `>` prefix removed.
// The `"> \n"` and `">"` cases are the same as in `parse_blockquote_empty`.
#[test]
fn parse_blockquote() {
test_parse!(
"> a\n",
(Enter(Container(Blockquote)), ">"),
(Enter(Leaf(Paragraph)), ""),
(EventKind::Atom(Inline), "a"),
(Exit(Leaf(Paragraph)), ""),
(Exit(Container(Blockquote)), ">"),
);
test_parse!(
"> \n",
(Enter(Container(Blockquote)), ">"),
(EventKind::Atom(Blankline), "\n"),
(Exit(Container(Blockquote)), ">"),
);
test_parse!(
">",
(Enter(Container(Blockquote)), ">"),
(EventKind::Atom(Blankline), ""),
(Exit(Container(Blockquote)), ">"),
);
test_parse!(
concat!(
"> a\n",
">\n",
"> ## hl\n",
">\n",
"> para\n", //
),
(Enter(Container(Blockquote)), ">"),
(Enter(Leaf(Paragraph)), ""),
(EventKind::Atom(Inline), "a"),
(Exit(Leaf(Paragraph)), ""),
(EventKind::Atom(Blankline), "\n"),
(Enter(Leaf(Heading)), "##"),
(EventKind::Atom(Inline), "hl"),
(Exit(Leaf(Heading)), "##"),
(EventKind::Atom(Blankline), "\n"),
(Enter(Leaf(Paragraph)), ""),
(EventKind::Atom(Inline), "para"),
(Exit(Leaf(Paragraph)), ""),
(Exit(Container(Blockquote)), ">"),
);
}
// A blockquote without content contains a single blank line, with or without newline.
#[test]
fn parse_blockquote_empty() {
test_parse!(
"> \n",
(Enter(Container(Blockquote)), ">"),
(EventKind::Atom(Blankline), "\n"),
(Exit(Container(Blockquote)), ">"),
);
test_parse!(
">",
(Enter(Container(Blockquote)), ">"),
(EventKind::Atom(Blankline), ""),
(Exit(Container(Blockquote)), ">"),
);
}
// Code blocks keep their lines verbatim; the fence lines are not part of the content.
// Covers a missing closing fence, a language specifier with a longer fence, two adjacent
// code blocks and `~` fences.
#[test]
fn parse_code_block() {
test_parse!(
concat!("```\n", "l0\n"),
(Enter(Leaf(CodeBlock)), "",),
(EventKind::Atom(Inline), "l0\n"),
(Exit(Leaf(CodeBlock)), "",),
);
test_parse!(
concat!(
"```\n",
"l0\n",
"```\n",
"\n",
"para\n", //
),
(Enter(Leaf(CodeBlock)), ""),
(EventKind::Atom(Inline), "l0\n"),
(Exit(Leaf(CodeBlock)), ""),
(EventKind::Atom(Blankline), "\n"),
(Enter(Leaf(Paragraph)), ""),
(EventKind::Atom(Inline), "para"),
(Exit(Leaf(Paragraph)), ""),
);
test_parse!(
concat!(
"```` lang\n",
"l0\n",
"```\n",
" l1\n",
"````", //
),
(Enter(Leaf(CodeBlock)), "lang"),
(EventKind::Atom(Inline), "l0\n"),
(EventKind::Atom(Inline), "```\n"),
(EventKind::Atom(Inline), " l1\n"),
(Exit(Leaf(CodeBlock)), "lang"),
);
test_parse!(
concat!(
"```\n", //
"a\n", //
"```\n", //
"```\n", //
"bbb\n", //
"```\n", //
),
(Enter(Leaf(CodeBlock)), ""),
(EventKind::Atom(Inline), "a\n"),
(Exit(Leaf(CodeBlock)), ""),
(Enter(Leaf(CodeBlock)), ""),
(EventKind::Atom(Inline), "bbb\n"),
(Exit(Leaf(CodeBlock)), ""),
);
test_parse!(
concat!(
"~~~\n",
"code\n",
" block\n",
"~~~\n", //
),
(Enter(Leaf(CodeBlock)), "",),
(EventKind::Atom(Inline), "code\n"),
(EventKind::Atom(Inline), " block\n"),
(Exit(Leaf(CodeBlock)), "",),
);
}
// The span of a link definition is its tag, the inline is the URL.
#[test]
fn parse_link_definition() {
test_parse!(
"[tag]: url\n",
(Enter(Leaf(LinkDefinition)), "tag"),
(EventKind::Atom(Inline), "url"),
(Exit(Leaf(LinkDefinition)), "tag"),
);
}
// The span of a footnote is its tag without `^`, the description is parsed as child blocks.
#[test]
fn parse_footnote() {
test_parse!(
"[^tag]: description\n",
(Enter(Container(Footnote)), "tag"),
(Enter(Leaf(Paragraph)), ""),
(EventKind::Atom(Inline), "description"),
(Exit(Leaf(Paragraph)), ""),
(Exit(Container(Footnote)), "tag"),
);
}
// Run `BlockParser::parse` on the lines of `$src` and assert the kind of the first block,
// the source text covered by its span and the number of lines it uses.
macro_rules! test_block {
($src:expr, $kind:expr, $str:expr, $len:expr $(,)?) => {
let lines = super::lines($src).map(|sp| sp.of($src));
let (_indent, kind, sp, len) = super::BlockParser::parse(lines).unwrap();
assert_eq!(
(kind, sp.of($src), len),
($kind, $str, $len),
"\n\n{}\n\n",
$src
);
};
}
// A blank line spans only its newline, leading whitespace is excluded.
#[test]
fn block_blankline() {
test_block!("\n", Block::Atom(Blankline), "\n", 1);
test_block!(" \n", Block::Atom(Blankline), "\n", 1);
}
// A heading continues on the following non-blank line.
#[test]
fn block_multiline() {
test_block!("# heading\n spanning two lines\n", Leaf(Heading), "#", 2);
}
// A blockquote continues for as long as lines start with `>`, ignoring indentation.
#[test]
fn block_blockquote() {
test_block!(
concat!(
"> a\n", //
">\n", //
" > b\n", //
">\n", //
"> c\n", //
),
Block::Container(Blockquote),
">",
5,
);
}
// A thematic break may mix `-` and `*` with whitespace and always uses a single line.
#[test]
fn block_thematic_break() {
test_block!("---\n", Block::Atom(ThematicBreak), "---", 1);
test_block!(
concat!(
" -*- -*-\n",
"\n", //
"para", //
),
Block::Atom(ThematicBreak),
"-*- -*-",
1
);
}
// The line count of a code block includes both fences. A shorter fence does not close the
// block and a language specifier containing whitespace makes the block a paragraph.
#[test]
fn block_code_block() {
test_block!(
concat!(
"```` lang\n",
"l0\n",
"```\n",
" l1\n",
"````", //
),
Leaf(CodeBlock),
"lang",
5,
);
test_block!(
concat!(
"```\n", //
"a\n", //
"```\n", //
"```\n", //
"bbb\n", //
"```\n", //
),
Leaf(CodeBlock),
"",
3,
);
test_block!(
concat!(
"``` no space in lang specifier\n",
"l0\n",
"```\n", //
),
Leaf(Paragraph),
"",
3,
);
}
// A link definition continues only on lines that start with a space.
#[test]
fn block_link_definition() {
test_block!("[tag]: url\n", Leaf(LinkDefinition), "tag", 1);
test_block!(
concat!(
"[tag]: uuu\n",
" rl\n", //
),
Leaf(LinkDefinition),
"tag",
2,
);
test_block!(
concat!(
"[tag]: url\n",
"para\n", //
),
Leaf(LinkDefinition),
"tag",
1,
);
}
}