jotdown/src/block.rs

554 lines
18 KiB
Rust
Raw Normal View History

2022-11-12 12:45:17 -05:00
use crate::Span;
use crate::EOF;
use crate::tree;
use Container::*;
use Leaf::*;
/// Block-level tree: [`Block`] values are entered/exited as nodes and [`Atom`] values are
/// attached as elements.
pub type Tree = tree::Tree<Block, Atom>;
/// Parse the block structure of `src` and return the resulting [`Tree`].
pub fn parse(src: &str) -> Tree {
    let parser = Parser::new(src);
    parser.parse()
}
2022-11-28 14:12:49 -05:00
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
2022-11-12 12:45:17 -05:00
pub enum Block {
Leaf(Leaf),
Container(Container),
}
2022-11-28 14:12:49 -05:00
/// Kinds of leaf block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Leaf {
    /// Fallback kind for a line that starts no other block.
    Paragraph,
    /// Heading; `level` is the number of `#` characters in its marker.
    Heading { level: u8 },
    /// Attributes block.
    Attributes,
    /// Table block.
    Table,
    /// Link definition block.
    LinkDefinition,
    /// Fenced code block; `fence_length` is the number of backticks in the opening fence.
    CodeBlock { fence_length: u8 },
    /// Line consisting of at least three `-` or three `*`, ignoring whitespace.
    ThematicBreak,
}
2022-11-28 14:12:49 -05:00
/// Kinds of container block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Container {
    /// Block quote; continued by lines that start with `>` after trimming.
    Blockquote,
    /// Fenced div; `fence_length` is the number of colons in the opening fence.
    Div { fence_length: u8 },
    /// List item; `indent` is the minimum leading whitespace a line needs to continue it.
    ListItem { indent: u8 },
    /// Footnote; `indent` is the minimum leading whitespace a line needs to continue it.
    Footnote { indent: u8 },
}
2022-11-28 14:12:49 -05:00
/// Elements that are attached to the tree without children.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Atom {
    /// Inline content with unparsed inline elements.
    Inline,
    /// A line with no non-whitespace characters.
    Blankline,
}
/// Block parser state: the source text plus the tree being built from it.
struct Parser<'s> {
    /// Source text; every `Span` handled by the parser is resolved against it.
    src: &'s str,
    /// Builder that receives the enter/exit/element events and yields the final tree.
    tree: tree::Builder<Block, Atom>,
}
impl<'s> Parser<'s> {
#[must_use]
pub fn new(src: &'s str) -> Self {
Self {
src,
tree: tree::Builder::new(),
}
}
#[must_use]
pub fn parse(mut self) -> Tree {
let mut lines = lines(self.src).collect::<Vec<_>>();
let mut line_pos = 0;
2022-11-28 14:12:49 -05:00
while line_pos < lines.len() {
2022-11-12 12:45:17 -05:00
let line_count = self.parse_block(&mut lines[line_pos..]);
if line_count == 0 {
break;
}
line_pos += line_count;
}
self.tree.finish()
}
/// Recursively parse a block and all of its children. Return number of lines the block uses.
fn parse_block(&mut self, lines: &mut [Span]) -> usize {
let blanklines = lines
.iter()
.take_while(|sp| sp.of(self.src).trim().is_empty())
.map(|sp| {
self.tree.elem(Atom::Blankline, *sp);
})
.count();
let lines = &mut lines[blanklines..];
2022-11-28 14:30:18 -05:00
Block::parse(lines.iter().map(|sp| sp.of(self.src))).map_or(
blanklines,
|(kind, span, line_count)| {
2022-11-28 14:30:18 -05:00
let start = lines.get(0).map(|sp| sp.start()).unwrap();
let span = span.translate(start);
match &kind {
Block::Leaf(l) => {
self.tree.enter(kind, span);
let first = &mut lines[0];
*first = first.with_start(span.end());
// trim starting whitespace of block
*first = Span::new(
first.end() - first.of(self.src).trim_start().len(),
first.end(),
);
let line_count = match l {
CodeBlock { .. } => line_count - 1,
_ => line_count,
2022-11-28 14:30:18 -05:00
};
if !matches!(l, Leaf::CodeBlock { .. }) {
// trim ending whitespace of block
let last = &mut lines[line_count - 1];
*last = last.with_len(last.of(self.src).trim_end().len());
}
for line in &lines[0..line_count] {
2022-11-28 14:30:18 -05:00
self.tree.elem(Atom::Inline, *line);
}
2022-11-12 12:45:17 -05:00
}
2022-11-28 14:30:18 -05:00
Block::Container(c) => {
let (skip_chars, skip_lines_suffix) = match &c {
Blockquote => (2, 0),
ListItem { indent } | Footnote { indent } => (*indent, 0),
Div { .. } => (0, 1),
};
let line_count_inner = lines.len() - skip_lines_suffix;
// update spans, remove indentation / container prefix
lines[0] = lines[0].with_start(span.end());
lines
.iter_mut()
.skip(1)
.take(line_count_inner)
.for_each(|sp| {
let skip = (sp
.of(self.src)
.chars()
.take_while(|c| c.is_whitespace())
.count()
+ usize::from(skip_chars))
.min(sp.len());
*sp = sp.trim_start(skip);
});
self.tree.enter(kind, span);
let mut l = 0;
while l < line_count_inner {
l += self.parse_block(&mut lines[l..line_count_inner]);
}
2022-11-12 12:45:17 -05:00
}
}
2022-11-28 14:30:18 -05:00
self.tree.exit();
blanklines + line_count
2022-11-28 14:30:18 -05:00
},
)
2022-11-12 12:45:17 -05:00
}
}
impl Block {
/// Parse a single block. Return number of lines the block uses.
2022-11-27 15:59:54 -05:00
fn parse<'b, I: Iterator<Item = &'b str>>(mut lines: I) -> Option<(Block, Span, usize)> {
lines.next().map(|l| {
2022-11-12 12:45:17 -05:00
let (kind, sp) = Block::start(l);
2022-11-28 14:30:18 -05:00
let has_end_delimiter = matches!(
kind,
Self::Leaf(CodeBlock { .. }) | Self::Container(Div { .. })
);
let line_count_match = lines.take_while(|l| kind.continues(l)).count();
let line_count = 1 + line_count_match + usize::from(has_end_delimiter);
2022-11-27 15:59:54 -05:00
(kind, sp, line_count)
})
2022-11-12 12:45:17 -05:00
}
/// Determine what type of block a line can start.
2022-11-27 15:59:54 -05:00
fn start(line: &str) -> (Self, Span) {
2022-11-12 12:45:17 -05:00
let start = line.chars().take_while(|c| c.is_whitespace()).count();
let line = &line[start..];
let mut chars = line.chars();
match chars.next().unwrap_or(EOF) {
'#' => chars
.find(|c| *c != '#')
.map_or(true, char::is_whitespace)
.then(|| {
2022-11-27 15:59:54 -05:00
u8::try_from(line.len() - chars.as_str().len() - 1)
.ok()
.map(|level| {
(
Self::Leaf(Heading { level }),
Span::by_len(start, level.into()),
)
})
})
.flatten(),
'>' => {
if let Some(c) = chars.next() {
c.is_whitespace().then(|| {
(
Self::Container(Blockquote),
Span::by_len(start, line.len() - chars.as_str().len() - 1),
)
})
} else {
Some((
Self::Container(Blockquote),
Span::by_len(start, line.len() - chars.as_str().len()),
))
}
}
2022-11-27 15:59:54 -05:00
f @ ('`' | ':') => {
let fence_length = (&mut chars).take_while(|c| *c == f).count() + 1;
let valid_spec = !line[fence_length..].trim().chars().any(char::is_whitespace);
(valid_spec && fence_length >= 3)
2022-11-27 15:59:54 -05:00
.then(|| {
u8::try_from(fence_length).ok().map(|fence_length| {
(
match f {
'`' => Self::Leaf(CodeBlock { fence_length }),
':' => Self::Container(Div { fence_length }),
_ => unreachable!(),
},
2022-11-28 14:30:18 -05:00
Span::by_len(start, fence_length.into()),
2022-11-27 15:59:54 -05:00
)
})
})
.flatten()
2022-11-12 12:45:17 -05:00
}
_ => {
let thematic_break = || {
let mut without_whitespace = line.chars().filter(|c| !c.is_whitespace());
let length = without_whitespace.clone().count();
(length >= 3
&& (without_whitespace.clone().all(|c| c == '-')
|| without_whitespace.all(|c| c == '*')))
.then(|| (Self::Leaf(ThematicBreak), Span::by_len(start, line.len())))
};
2022-11-28 14:12:49 -05:00
thematic_break()
2022-11-12 12:45:17 -05:00
}
}
.unwrap_or((Self::Leaf(Paragraph), Span::new(0, 0)))
}
/// Determine if this line continues a block of a certain type.
2022-11-28 14:49:38 -05:00
fn continues(self, line: &str) -> bool {
//let start = Self::start(line); // TODO allow starting new block without blank line
2022-11-12 12:45:17 -05:00
match self {
Self::Leaf(Paragraph | Heading { .. } | Table | LinkDefinition) => {
!line.trim().is_empty()
}
2022-11-28 14:12:49 -05:00
Self::Leaf(Attributes | ThematicBreak) => false,
2022-11-12 12:45:17 -05:00
Self::Container(Blockquote) => line.trim().starts_with('>'),
Self::Container(Footnote { indent } | ListItem { indent }) => {
let spaces = line.chars().take_while(|c| c.is_whitespace()).count();
2022-11-28 14:49:38 -05:00
!line.trim().is_empty() && spaces >= (indent).into()
2022-11-12 12:45:17 -05:00
}
2022-11-27 15:59:54 -05:00
Self::Container(Div { fence_length }) | Self::Leaf(CodeBlock { fence_length }) => {
let fence = if matches!(self, Self::Container(..)) {
':'
} else {
'`'
};
2022-11-27 15:59:54 -05:00
let mut c = line.chars();
!((&mut c).take((fence_length).into()).all(|c| c == fence)
&& c.next().map_or(true, char::is_whitespace))
2022-11-12 12:45:17 -05:00
}
}
}
}
/// Displays a block as the `Debug` representation of the leaf or container it wraps.
impl std::fmt::Display for Block {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let inner: &dyn std::fmt::Debug = match self {
            Block::Leaf(leaf) => leaf,
            Block::Container(container) => container,
        };
        std::fmt::Debug::fmt(inner, f)
    }
}
impl std::fmt::Display for Atom {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Inline")
}
}
/// Split `src` into lines, yielding a `Span` for each one.
///
/// Similar to `std::str::split('\n')`, but every span includes its terminating newline (the
/// last line may lack one) and no empty span is produced.
fn lines(src: &str) -> impl Iterator<Item = Span> + '_ {
    let mut pos = 0;
    std::iter::from_fn(move || {
        let rest = &src[pos..];
        if rest.is_empty() {
            return None;
        }
        // Up to and including the next newline, or the rest of the input if there is none.
        let len = rest.find('\n').map_or(rest.len(), |newline| newline + 1);
        let start = pos;
        pos += len;
        Some(Span::new(start, pos))
    })
}
#[cfg(test)]
mod test {
2022-11-28 14:12:49 -05:00
use crate::tree::EventKind::*;
2022-11-12 12:45:17 -05:00
use super::Atom::*;
use super::Block;
use super::Block::*;
use super::Container::*;
use super::Leaf::*;
2022-11-27 16:19:15 -05:00
// Parse `$src` and compare the resulting events, as `(kind, source text of span)` pairs,
// against the expected list.
macro_rules! test_parse {
    ($src:expr $(,$($event:expr),* $(,)?)?) => {
        let t = super::Parser::new($src).parse();
        let actual = t.map(|ev| (ev.kind, ev.span.of($src))).collect::<Vec<_>>();
        let expected = &[$($($event),*,)?];
        assert_eq!(actual, expected, "\n\n{}\n\n", $src);
    };
}
2022-11-27 16:19:15 -05:00
2022-11-12 12:45:17 -05:00
// A single-line paragraph yields one inline element without the trailing newline.
#[test]
fn parse_para_oneline() {
    test_parse!(
        "para\n",
        (Enter(Leaf(Paragraph)), ""),
        (Element(Inline), "para"),
        (Exit(Leaf(Paragraph)), ""),
    );
}
// Only the last line of a multi-line paragraph loses its trailing newline.
#[test]
fn parse_para_multiline() {
    test_parse!(
        "para0\npara1\n",
        (Enter(Leaf(Paragraph)), ""),
        (Element(Inline), "para0\n"),
        (Element(Inline), "para1"),
        (Exit(Leaf(Paragraph)), ""),
    );
}
// Two headings separated by a blank line; the second one spans several lines.
#[test]
fn parse_heading_multi() {
    test_parse!(
        concat!(
            "# 2\n",
            "\n",
            " # 8\n",
            " 12\n",
            "15\n", //
        ),
        (Enter(Leaf(Heading { level: 1 })), "#"),
        (Element(Inline), "2"),
        (Exit(Leaf(Heading { level: 1 })), "#"),
        (Element(Blankline), "\n"),
        (Enter(Leaf(Heading { level: 1 })), "#"),
        (Element(Inline), "8\n"),
        (Element(Inline), " 12\n"),
        (Element(Inline), "15"),
        (Exit(Leaf(Heading { level: 1 })), "#"),
    );
}
// Block quotes: with a paragraph, with only whitespace, with only the marker, and with
// several child blocks.
#[test]
fn parse_blockquote() {
    test_parse!(
        "> a\n",
        (Enter(Container(Blockquote)), ">"),
        (Enter(Leaf(Paragraph)), ""),
        (Element(Inline), "a"),
        (Exit(Leaf(Paragraph)), ""),
        (Exit(Container(Blockquote)), ">"),
    );
    test_parse!(
        "> \n",
        (Enter(Container(Blockquote)), ">"),
        (Element(Blankline), " \n"),
        (Exit(Container(Blockquote)), ">"),
    );
    test_parse!(
        ">",
        (Enter(Container(Blockquote)), ">"),
        (Element(Blankline), ""),
        (Exit(Container(Blockquote)), ">"),
    );
    test_parse!(
        concat!(
            "> a\n",
            ">\n",
            "> ## hl\n",
            ">\n",
            "> para\n", //
        ),
        (Enter(Container(Blockquote)), ">"),
        (Enter(Leaf(Paragraph)), ""),
        (Element(Inline), "a"),
        (Exit(Leaf(Paragraph)), ""),
        (Element(Blankline), ""),
        (Enter(Leaf(Heading { level: 2 })), "##"),
        (Element(Inline), "hl"),
        (Exit(Leaf(Heading { level: 2 })), "##"),
        (Element(Blankline), ""),
        (Enter(Leaf(Paragraph)), ""),
        (Element(Inline), "para"),
        (Exit(Leaf(Paragraph)), ""),
        (Exit(Container(Blockquote)), ">"),
    );
}
2022-12-02 14:07:37 -05:00
// Block quotes without any content.
#[test]
fn parse_blockquote_empty() {
    // NOTE(review): `parse_blockquote` checks the same input "> \n" but expects the blank
    // line to be " \n" (including the space after the marker). The two expectations cannot
    // both hold; confirm which one is intended and align them.
    test_parse!(
        "> \n",
        (Enter(Container(Blockquote)), ">"),
        (Element(Blankline), "\n"),
        (Exit(Container(Blockquote)), ">"),
    );
    test_parse!(
        ">",
        (Enter(Container(Blockquote)), ">"),
        (Element(Blankline), ""),
        (Exit(Container(Blockquote)), ">"),
    );
}
2022-11-27 15:59:54 -05:00
// Fenced code blocks: followed by a paragraph, with a language specifier and a shorter
// inner fence, and two blocks directly after each other.
#[test]
fn parse_code_block() {
    test_parse!(
        concat!(
            "```\n",
            "l0\n",
            "```\n",
            "\n",
            "para\n", //
        ),
        (Enter(Leaf(CodeBlock { fence_length: 3 })), "```"),
        (Element(Inline), ""),
        (Element(Inline), "l0\n"),
        (Exit(Leaf(CodeBlock { fence_length: 3 })), "```"),
        (Element(Blankline), "\n"),
        (Enter(Leaf(Paragraph)), ""),
        (Element(Inline), "para"),
        (Exit(Leaf(Paragraph)), ""),
    );
    test_parse!(
        concat!(
            "```` lang\n",
            "l0\n",
            "```\n",
            " l1\n",
            "````", //
        ),
        (Enter(Leaf(CodeBlock { fence_length: 4 })), "````"),
        (Element(Inline), "lang\n"),
        (Element(Inline), "l0\n"),
        (Element(Inline), "```\n"),
        (Element(Inline), " l1\n"),
        (Exit(Leaf(CodeBlock { fence_length: 4 })), "````"),
    );
    test_parse!(
        concat!(
            "```\n", //
            "a\n",   //
            "```\n", //
            "```\n", //
            "bbb\n", //
            "```\n", //
        ),
        (Enter(Leaf(CodeBlock { fence_length: 3 })), "```"),
        (Element(Inline), ""),
        (Element(Inline), "a\n"),
        (Exit(Leaf(CodeBlock { fence_length: 3 })), "```"),
        (Enter(Leaf(CodeBlock { fence_length: 3 })), "```"),
        (Element(Inline), ""),
        (Element(Inline), "bbb\n"),
        (Exit(Leaf(CodeBlock { fence_length: 3 })), "```"),
    );
}
2022-11-27 15:59:54 -05:00
2022-11-27 16:19:15 -05:00
// Run `Block::parse` on the lines of `$src` and compare the block kind, the source text of
// its marker span and its line count against the expected values.
macro_rules! test_block {
    ($src:expr, $kind:expr, $str:expr, $len:expr $(,)?) => {
        let lines = super::lines($src).map(|sp| sp.of($src));
        let (kind, sp, len) = Block::parse(lines).unwrap();
        assert_eq!(
            (kind, sp.of($src), len),
            ($kind, $str, $len),
            "\n\n{}\n\n",
            $src
        );
    };
}
2022-11-12 12:45:17 -05:00
// A heading continues over following non-blank lines.
#[test]
fn block_multiline() {
    test_block!(
        "# heading\n spanning two lines\n",
        Block::Leaf(Heading { level: 1 }),
        "#",
        2
    );
}
// A block quote spans every following line that starts with `>` after trimming.
#[test]
fn block_blockquote() {
    test_block!(
        concat!(
            "> a\n", //
            ">\n",   //
            " > b\n", //
            ">\n",   //
            "> c\n", //
        ),
        Block::Container(Blockquote),
        ">",
        5,
    );
}
2022-12-02 14:07:37 -05:00
// Line counts of fenced code blocks, and the paragraph fallback for an invalid language
// specifier.
#[test]
fn block_code_block() {
    test_block!(
        concat!(
            "```` lang\n",
            "l0\n",
            "```\n",
            " l1\n",
            "````", //
        ),
        Block::Leaf(CodeBlock { fence_length: 4 }),
        "````",
        5,
    );
    test_block!(
        concat!(
            "```\n", //
            "a\n",   //
            "```\n", //
            "```\n", //
            "bbb\n", //
            "```\n", //
        ),
        Block::Leaf(CodeBlock { fence_length: 3 }),
        "```",
        3,
    );
    test_block!(
        concat!(
            "``` no space in lang specifier\n",
            "l0\n",
            "```\n", //
        ),
        Block::Leaf(Paragraph),
        "",
        3,
    );
}
2022-11-12 12:45:17 -05:00
}