block wip

This commit is contained in:
Noah Hellman 2022-12-07 18:44:03 +01:00
parent 8a525f753b
commit c53988cc47

View file

@ -8,31 +8,59 @@ use Leaf::*;
pub type Tree = tree::Tree<Block, Atom>; pub type Tree = tree::Tree<Block, Atom>;
#[must_use]
pub fn parse(src: &str) -> Tree { pub fn parse(src: &str) -> Tree {
Parser::new(src).parse() Parser::new(src).parse()
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Block { pub enum Block {
/// A leaf block, containing only inline elements.
Leaf(Leaf), Leaf(Leaf),
/// A container block, containing children blocks.
Container(Container), Container(Container),
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Leaf { pub enum Leaf {
/// Span is empty, before first character of paragraph.
/// Each inline is a line.
Paragraph, Paragraph,
/// Span is `#` characters.
/// Each inline is a line.
Heading { level: u8 }, Heading { level: u8 },
/// Span is first `|` character.
/// Each inline is a line (row).
Table, Table,
/// Span is the link tag.
/// Inlines are lines of the URL.
LinkDefinition, LinkDefinition,
CodeBlock { fence_length: u8 },
/// Span is language specifier.
/// Each inline is a line.
CodeBlock { fence_length: u8, c: u8 },
/// Span is from first to last character.
/// No inlines.
ThematicBreak, ThematicBreak,
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Container { pub enum Container {
/// Span is first `>` character.
Blockquote, Blockquote,
/// Span is class specifier.
Div { fence_length: u8 }, Div { fence_length: u8 },
/// Span is the list marker.
ListItem { indent: u8 }, ListItem { indent: u8 },
/// Span is first `[^` instance.
Footnote { indent: u8 }, Footnote { indent: u8 },
} }
@ -40,8 +68,10 @@ pub enum Container {
pub enum Atom { pub enum Atom {
/// Inline content with unparsed inline elements. /// Inline content with unparsed inline elements.
Inline, Inline,
/// A line with no non-whitespace characters. /// A line with no non-whitespace characters.
Blankline, Blankline,
/// A list of attributes. /// A list of attributes.
Attributes, Attributes,
} }
@ -84,30 +114,64 @@ impl<'s> Parser<'s> {
}) })
.count(); .count();
let lines = &mut lines[blanklines..]; let lines = &mut lines[blanklines..];
Block::parse(lines.iter().map(|sp| sp.of(self.src))).map_or( Block::parse(lines.iter().map(|sp| sp.of(self.src))).map_or(
blanklines, blanklines,
|(kind, span, line_count)| { |(kind, span, line_count)| {
let start = lines.get(0).map(|sp| sp.start()).unwrap(); let lines = {
let span = span.translate(start); let l = lines.len().min(line_count);
&mut lines[..l]
};
let truncated = lines.len() < line_count;
let span = span.translate(lines[0].start());
// skip part of first inline that is shared with the block span
lines[0] = lines[0].with_start(span.end());
// remove junk from footnotes / link defs
if matches!(
kind,
Block::Leaf(LinkDefinition) | Block::Container(Footnote { .. })
) {
assert_eq!(&lines[0].of(self.src).chars().as_str()[0..2], "]:");
lines[0] = lines[0].skip(2);
}
// skip closing fence of code blocks / divs
let lines = if !truncated
&& matches!(
kind,
Block::Leaf(CodeBlock { .. }) | Block::Container(Div { .. })
) {
let l = lines.len();
&mut lines[..l - 1]
} else {
lines
};
match &kind { match &kind {
Block::Leaf(l) => { Block::Leaf(l) => {
self.tree.enter(kind, span); self.tree.enter(kind, span);
let first = &mut lines[0];
*first = first.with_start(span.end()); // trim starting whitespace of the block contents
// trim starting whitespace of block lines[0] = lines[0].trim_start(self.src);
*first = first.trim_start(self.src);
let line_count = match l { // skip first inline if empty (e.g. code block)
CodeBlock { .. } => line_count - 1, let lines = if lines[0].is_empty() {
_ => line_count, &mut lines[1..]
} else {
lines
}; };
// trim ending whitespace of block if not verbatim
if !matches!(l, Leaf::CodeBlock { .. }) { if !matches!(l, Leaf::CodeBlock { .. }) {
// trim ending whitespace of block
let last = &mut lines[line_count - 1]; let last = &mut lines[line_count - 1];
*last = last.trim_end(self.src); *last = last.trim_end(self.src);
} }
for line in &lines[0..line_count] {
self.tree.elem(Atom::Inline, *line); lines
} .iter()
.for_each(|line| self.tree.elem(Atom::Inline, *line));
} }
Block::Container(c) => { Block::Container(c) => {
let (skip_chars, skip_lines_suffix) = match &c { let (skip_chars, skip_lines_suffix) = match &c {
@ -118,7 +182,6 @@ impl<'s> Parser<'s> {
let line_count_inner = lines.len() - skip_lines_suffix; let line_count_inner = lines.len() - skip_lines_suffix;
// update spans, remove indentation / container prefix // update spans, remove indentation / container prefix
lines[0] = lines[0].with_start(span.end());
lines lines
.iter_mut() .iter_mut()
.skip(1) .skip(1)
@ -166,15 +229,15 @@ impl Block {
/// Determine what type of block a line can start. /// Determine what type of block a line can start.
fn start(line: &str) -> (Self, Span) { fn start(line: &str) -> (Self, Span) {
let start = line.chars().take_while(|c| c.is_whitespace()).count(); let start = line.chars().take_while(|c| c.is_whitespace()).count();
let line = &line[start..]; let line_t = &line[start..];
let mut chars = line.chars(); let mut chars = line_t.chars();
match chars.next().unwrap_or(EOF) { match chars.next().unwrap_or(EOF) {
'#' => chars '#' => chars
.find(|c| *c != '#') .find(|c| *c != '#')
.map_or(true, char::is_whitespace) .map_or(true, char::is_whitespace)
.then(|| { .then(|| {
u8::try_from(line.len() - chars.as_str().len() - 1) u8::try_from(line_t.len() - chars.as_str().len() - 1)
.ok() .ok()
.map(|level| { .map(|level| {
( (
@ -189,26 +252,27 @@ impl Block {
c.is_whitespace().then(|| { c.is_whitespace().then(|| {
( (
Self::Container(Blockquote), Self::Container(Blockquote),
Span::by_len(start, line.len() - chars.as_str().len() - 1), Span::by_len(start, line_t.len() - chars.as_str().len() - 1),
) )
}) })
} else { } else {
Some(( Some((
Self::Container(Blockquote), Self::Container(Blockquote),
Span::by_len(start, line.len() - chars.as_str().len()), Span::by_len(start, line_t.len() - chars.as_str().len()),
)) ))
} }
} }
'|' => (&line[line.len() - 1..] == "|" '|' => (&line_t[line_t.len() - 1..] == "|"
&& &line[line.len() - 2..line.len() - 1] != "\\") && &line_t[line_t.len() - 2..line_t.len() - 1] != "\\")
.then(|| (Self::Leaf(Table), Span::by_len(start, 1))), .then(|| (Self::Leaf(Table), Span::by_len(start, 1))),
'[' => { '[' => chars.as_str().find("]:").map(|l| {
let first = chars.next(); let tag = &chars.as_str()[0..l];
let is_footnote = chars.next() == Some('^'); let (tag, is_footnote) = if let Some(tag) = tag.strip_prefix('^') {
if first != Some(']') { (tag, true)
(&mut chars).take_while(|c| *c != ']').count(); } else {
} (tag, false)
(chars.next() == Some(':')).then(|| { };
dbg!(line, line_t, tag);
( (
if is_footnote { if is_footnote {
Self::Container(Footnote { Self::Container(Footnote {
@ -217,13 +281,13 @@ impl Block {
} else { } else {
Self::Leaf(LinkDefinition) Self::Leaf(LinkDefinition)
}, },
Span::by_len(start, 0), Span::from_slice(line, tag),
) )
}) }),
} '-' | '*' if Self::is_thematic_break(chars.clone()) => Some((
'-' | '*' if Self::is_thematic_break(chars.clone()) => { Self::Leaf(ThematicBreak),
Some((Self::Leaf(ThematicBreak), Span::by_len(start, line.len()))) Span::from_slice(line, line_t.trim()),
} )),
'-' => chars.next().map_or(true, char::is_whitespace).then(|| { '-' => chars.next().map_or(true, char::is_whitespace).then(|| {
let task_list = chars.next() == Some('[') let task_list = chars.next() == Some('[')
&& matches!(chars.next(), Some('X' | ' ')) && matches!(chars.next(), Some('X' | ' '))
@ -242,19 +306,22 @@ impl Block {
}), }),
Span::by_len(start, 1), Span::by_len(start, 1),
)), )),
f @ ('`' | ':') => { f @ ('`' | ':' | '~') => {
let fence_length = (&mut chars).take_while(|c| *c == f).count() + 1; let fence_length = (&mut chars).take_while(|c| *c == f).count() + 1;
let valid_spec = !line[fence_length..].trim().chars().any(char::is_whitespace); let lang = line_t[fence_length..].trim();
let valid_spec = !lang.chars().any(char::is_whitespace);
(valid_spec && fence_length >= 3) (valid_spec && fence_length >= 3)
.then(|| { .then(|| {
u8::try_from(fence_length).ok().map(|fence_length| { u8::try_from(fence_length).ok().map(|fence_length| {
( (
match f { match f {
'`' => Self::Leaf(CodeBlock { fence_length }),
':' => Self::Container(Div { fence_length }), ':' => Self::Container(Div { fence_length }),
_ => unreachable!(), _ => Self::Leaf(CodeBlock {
fence_length,
c: f as u8,
}),
}, },
Span::by_len(start, fence_length.into()), Span::from_slice(line, lang),
) )
}) })
}) })
@ -289,11 +356,11 @@ impl Block {
let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); let spaces = line.chars().take_while(|c| c.is_whitespace()).count();
!line.trim().is_empty() && spaces >= (indent).into() !line.trim().is_empty() && spaces >= (indent).into()
} }
Self::Container(Div { fence_length }) | Self::Leaf(CodeBlock { fence_length }) => { Self::Container(Div { fence_length }) | Self::Leaf(CodeBlock { fence_length, .. }) => {
let fence = if matches!(self, Self::Container(..)) { let fence = match self {
':' Self::Container(..) => ':',
} else { Self::Leaf(CodeBlock { c, .. }) => c as char,
'`' Self::Leaf(..) => unreachable!(),
}; };
let mut c = line.chars(); let mut c = line.chars();
!((&mut c).take((fence_length).into()).all(|c| c == fence) !((&mut c).take((fence_length).into()).all(|c| c == fence)
@ -463,6 +530,24 @@ mod test {
#[test] #[test]
fn parse_code_block() { fn parse_code_block() {
test_parse!(
concat!("```\n", "l0\n"),
(
Enter(Leaf(CodeBlock {
fence_length: 3,
c: b'`'
})),
"",
),
(Element(Inline), "l0\n"),
(
Exit(Leaf(CodeBlock {
fence_length: 3,
c: b'`'
})),
"",
),
);
test_parse!( test_parse!(
concat!( concat!(
"```\n", "```\n",
@ -471,10 +556,21 @@ mod test {
"\n", "\n",
"para\n", // "para\n", //
), ),
(Enter(Leaf(CodeBlock { fence_length: 3 })), "```"), (
(Element(Inline), ""), Enter(Leaf(CodeBlock {
fence_length: 3,
c: b'`'
})),
""
),
(Element(Inline), "l0\n"), (Element(Inline), "l0\n"),
(Exit(Leaf(CodeBlock { fence_length: 3 })), "```"), (
Exit(Leaf(CodeBlock {
fence_length: 3,
c: b'`'
})),
""
),
(Element(Blankline), "\n"), (Element(Blankline), "\n"),
(Enter(Leaf(Paragraph)), ""), (Enter(Leaf(Paragraph)), ""),
(Element(Inline), "para"), (Element(Inline), "para"),
@ -488,12 +584,23 @@ mod test {
" l1\n", " l1\n",
"````", // "````", //
), ),
(Enter(Leaf(CodeBlock { fence_length: 4 })), "````"), (
(Element(Inline), "lang\n"), Enter(Leaf(CodeBlock {
fence_length: 4,
c: b'`'
})),
"lang"
),
(Element(Inline), "l0\n"), (Element(Inline), "l0\n"),
(Element(Inline), "```\n"), (Element(Inline), "```\n"),
(Element(Inline), " l1\n"), (Element(Inline), " l1\n"),
(Exit(Leaf(CodeBlock { fence_length: 4 })), "````"), (
Exit(Leaf(CodeBlock {
fence_length: 4,
c: b'`'
})),
"lang"
),
); );
test_parse!( test_parse!(
concat!( concat!(
@ -504,14 +611,82 @@ mod test {
"bbb\n", // "bbb\n", //
"```\n", // "```\n", //
), ),
(Enter(Leaf(CodeBlock { fence_length: 3 })), "```"), (
(Element(Inline), ""), Enter(Leaf(CodeBlock {
fence_length: 3,
c: b'`'
})),
""
),
(Element(Inline), "a\n"), (Element(Inline), "a\n"),
(Exit(Leaf(CodeBlock { fence_length: 3 })), "```"), (
(Enter(Leaf(CodeBlock { fence_length: 3 })), "```"), Exit(Leaf(CodeBlock {
(Element(Inline), ""), fence_length: 3,
c: b'`'
})),
""
),
(
Enter(Leaf(CodeBlock {
fence_length: 3,
c: b'`'
})),
""
),
(Element(Inline), "bbb\n"), (Element(Inline), "bbb\n"),
(Exit(Leaf(CodeBlock { fence_length: 3 })), "```"), (
Exit(Leaf(CodeBlock {
fence_length: 3,
c: b'`'
})),
""
),
);
test_parse!(
concat!(
"~~~\n",
"code\n",
" block\n",
"~~~\n", //
),
(
Enter(Leaf(CodeBlock {
fence_length: 3,
c: b'~'
})),
"",
),
(Element(Inline), "code\n"),
(Element(Inline), " block\n"),
(
Exit(Leaf(CodeBlock {
fence_length: 3,
c: b'~'
})),
"",
),
);
}
#[test]
fn parse_link_definition() {
test_parse!(
"[tag]: url\n",
(Enter(Leaf(LinkDefinition)), "tag"),
(Element(Inline), "url"),
(Exit(Leaf(LinkDefinition)), "tag"),
);
}
#[test]
fn parse_footnote() {
test_parse!(
"[^tag]: description\n",
(Enter(Container(Footnote { indent: 0 })), "tag"),
(Enter(Leaf(Paragraph)), ""),
(Element(Inline), "description"),
(Exit(Leaf(Paragraph)), ""),
(Exit(Container(Footnote { indent: 0 })), "tag"),
); );
} }
@ -532,7 +707,7 @@ mod test {
fn block_multiline() { fn block_multiline() {
test_block!( test_block!(
"# heading\n spanning two lines\n", "# heading\n spanning two lines\n",
Block::Leaf(Heading { level: 1 }), Leaf(Heading { level: 1 }),
"#", "#",
2 2
); );
@ -564,8 +739,11 @@ mod test {
" l1\n", " l1\n",
"````", // "````", //
), ),
Block::Leaf(CodeBlock { fence_length: 4 }), Leaf(CodeBlock {
"````", fence_length: 4,
c: b'`'
}),
"lang",
5, 5,
); );
test_block!( test_block!(
@ -577,8 +755,11 @@ mod test {
"bbb\n", // "bbb\n", //
"```\n", // "```\n", //
), ),
Block::Leaf(CodeBlock { fence_length: 3 }), Leaf(CodeBlock {
"```", fence_length: 3,
c: b'`'
}),
"",
3, 3,
); );
test_block!( test_block!(
@ -587,7 +768,7 @@ mod test {
"l0\n", "l0\n",
"```\n", // "```\n", //
), ),
Block::Leaf(Paragraph), Leaf(Paragraph),
"", "",
3, 3,
); );
@ -595,14 +776,14 @@ mod test {
#[test] #[test]
fn block_link_definition() { fn block_link_definition() {
test_block!("[tag]: url\n", Block::Leaf(LinkDefinition), "", 1); test_block!("[tag]: url\n", Leaf(LinkDefinition), "tag", 1);
test_block!( test_block!(
concat!( concat!(
"[tag]: uuu\n", "[tag]: uuu\n",
" rl\n", // " rl\n", //
), ),
Block::Leaf(LinkDefinition), Leaf(LinkDefinition),
"", "tag",
2, 2,
); );
test_block!( test_block!(
@ -610,8 +791,8 @@ mod test {
"[tag]: url\n", "[tag]: url\n",
"para\n", // "para\n", //
), ),
Block::Leaf(LinkDefinition), Leaf(LinkDefinition),
"", "tag",
1, 1,
); );
} }