From c288264aee3080e819b5c06a156ccf869cb26907 Mon Sep 17 00:00:00 2001
From: Noah Hellman
Date: Wed, 25 Jan 2023 19:27:12 +0100
Subject: [PATCH] block: parse tables
---
src/block.rs | 255 +++++++++++++++++++++++++++++++++++++++++++++++++--
src/html.rs | 8 +-
src/lib.rs | 72 +++++++++------
src/tree.rs | 62 ++++++++++++-
4 files changed, 355 insertions(+), 42 deletions(-)
diff --git a/src/block.rs b/src/block.rs
index 7dc40ec..241dcb3 100644
--- a/src/block.rs
+++ b/src/block.rs
@@ -1,9 +1,11 @@
+use crate::Alignment;
use crate::OrderedListNumbering::*;
use crate::OrderedListStyle::*;
use crate::Span;
use crate::EOF;
use crate::attr;
+use crate::lex;
use crate::tree;
use Atom::*;
@@ -59,9 +61,9 @@ pub enum Leaf {
/// Each inline is a line.
Heading,
- /// Span is first `|` character.
- /// Each inline is a line (row).
- Table,
+ /// Span is '|'.
+ /// Has zero or one inline for the cell contents.
+ TableCell(Alignment),
/// Span is the link tag.
/// Inlines are lines of the URL.
@@ -91,6 +93,12 @@ pub enum Container {
/// Span is footnote tag.
Footnote,
+
+ /// Span is empty, before first '|' character.
+ Table,
+
+ /// Span is first '|' character.
+ TableRow { head: bool },
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -121,6 +129,8 @@ struct TreeParser<'s> {
prev_blankline: bool,
/// Stack of currently open lists.
open_lists: Vec,
+ /// Alignments for each column in the current table.
+ alignments: Vec,
}
impl<'s> TreeParser<'s> {
@@ -131,6 +141,7 @@ impl<'s> TreeParser<'s> {
tree: TreeBuilder::new(),
prev_blankline: false,
open_lists: Vec::new(),
+ alignments: Vec::new(),
}
}
@@ -253,6 +264,134 @@ impl<'s> TreeParser<'s> {
lines.iter().for_each(|line| self.tree.inline(*line));
self.tree.exit();
}
+ Block::Container(Table) => {
+ self.alignments.clear();
+ self.tree.enter(Node::Container(Table), span);
+ let mut last_row_node = None; // latest row still in the tree; candidate head row
+ for row in lines {
+ let row_node = self
+ .tree
+ .enter(Node::Container(TableRow { head: false }), row.with_len(1));
+ let rem = row.skip(1);
+ let lex = lex::Lexer::new(rem.of(self.src).chars());
+ let mut pos = rem.start();
+ let mut cell_start = pos;
+ let mut separator_row = true; // cleared by the first cell that is not a separator
+ let mut verbatim = None; // length of the backtick run that opened a verbatim span
+ let mut column_index = 0;
+ for lex::Token { kind, len } in lex {
+ if let Some(l) = verbatim {
+ if matches!(kind, lex::Kind::Seq(lex::Sequence::Backtick))
+ && len == l
+ {
+ verbatim = None;
+ }
+ } else {
+ match kind {
+ lex::Kind::Sym(lex::Symbol::Pipe) => {
+ let span =
+ Span::new(cell_start, pos).trim(self.src);
+ let cell = span.of(self.src);
+ let separator_cell = match cell.len() {
+ 0 => false,
+ 1 => cell == "-",
+ 2 => matches!(cell, ":-" | "--" | "-:"),
+ l => {
+ matches!(cell.as_bytes()[0], b'-' | b':')
+ && matches!(
+ cell.as_bytes()[l - 1],
+ b'-' | b':'
+ )
+ && cell
+ .chars()
+ .skip(1)
+ .take(l - 2)
+ .all(|c| c == '-')
+ }
+ };
+ separator_row &= separator_cell;
+ self.tree.enter(
+ Node::Leaf(TableCell(
+ self.alignments
+ .get(column_index)
+ .copied()
+ .unwrap_or(Alignment::Unspecified),
+ )),
+ Span::by_len(cell_start - 1, 1),
+ );
+ self.tree.inline(span);
+ self.tree.exit(); // cell
+ cell_start = pos + len;
+ column_index += 1;
+ }
+ lex::Kind::Seq(lex::Sequence::Backtick) => {
+ verbatim = Some(len);
+ }
+ _ => {}
+ }
+ }
+ pos += len;
+ }
+ // An empty row or one ending inside verbatim is not a separator row.
+ if separator_row && column_index > 0 && verbatim.is_none() {
+ self.alignments.clear();
+ self.alignments.extend(
+ self.tree
+ .children(row_node)
+ .filter(|(kind, _)| matches!(kind, tree::Element::Inline))
+ .map(|(_, sp)| {
+ let cell = sp.of(self.src);
+ let l = cell.as_bytes()[0] == b':';
+ let r = cell.as_bytes()[cell.len() - 1] == b':';
+ match (l, r) {
+ (false, false) => Alignment::Unspecified,
+ (false, true) => Alignment::Right,
+ (true, false) => Alignment::Left,
+ (true, true) => Alignment::Center,
+ }
+ }),
+ );
+ self.tree.exit_discard(); // table row
+ if let Some(head_row) = last_row_node {
+ self.tree
+ .children(head_row)
+ .filter(|(e, _sp)| {
+ matches!(
+ e,
+ tree::Element::Container(Node::Leaf(TableCell(..)))
+ )
+ })
+ .zip(
+ self.alignments
+ .iter()
+ .copied()
+ .chain(std::iter::repeat(Alignment::Unspecified)),
+ )
+ .for_each(|((e, _), new_align)| {
+ if let tree::Element::Container(Node::Leaf(
+ TableCell(alignment),
+ )) = e
+ {
+ *alignment = new_align;
+ }
+ });
+ if let tree::Element::Container(Node::Container(TableRow {
+ head,
+ })) = self.tree.elem(head_row)
+ {
+ *head = true;
+ } else {
+ panic!()
+ }
+ }
+ last_row_node = None; // the discarded row is gone; its index must not be reused
+ } else {
+ self.tree.exit(); // table row
+ last_row_node = Some(row_node);
+ }
+ }
+ self.tree.exit(); // table
+ }
Block::Container(c) => {
let line_count_inner = lines.len() - usize::from(matches!(c, Div));
@@ -270,7 +409,9 @@ impl<'s> TreeParser<'s> {
let skip = match c {
Blockquote => spaces + "> ".len(),
ListItem(..) | Footnote | Div => spaces.min(indent),
- List { .. } | DescriptionList => panic!(),
+ List { .. } | DescriptionList | Table | TableRow { .. } => {
+ panic!()
+ }
};
let len = sp.len() - usize::from(sp.of(self.src).ends_with('\n'));
*sp = sp.skip(skip.min(len));
@@ -381,9 +522,12 @@ impl BlockParser {
}
'{' => (attr::valid(line_t.chars()).0 == line_t.trim_end().len())
.then(|| (Block::Atom(Attributes), Span::by_len(start, line_t.len()))),
- '|' => (&line_t[line_t.len() - 1..] == "|"
- && &line_t[line_t.len() - 2..line_t.len() - 1] != "\\")
- .then(|| (Block::Leaf(Table), Span::by_len(start, 1))),
+ '|' => {
+ // A table row must end with an unescaped '|'. Both '|' and '\\' are ASCII, so matching
+ // raw bytes is UTF-8 safe; the pattern needs two bytes, so a lone "|" no longer panics.
+ let row = matches!(line_t.trim_end().as_bytes(), [.., prev, b'|'] if *prev != b'\\');
+ row.then(|| (Block::Container(Table), Span::empty_at(start)))
+ }
'[' => chars.as_str().find("]:").map(|l| {
let tag = &chars.as_str()[0..l];
let (tag, is_footnote) = if let Some(tag) = tag.strip_prefix('^') {
@@ -472,7 +616,7 @@ impl BlockParser {
let empty = line_t.is_empty();
match self.kind {
Block::Atom(..) => false,
- Block::Leaf(Paragraph | Heading | Table) => !line.trim().is_empty(),
+ Block::Leaf(Paragraph | Heading) => !line.trim().is_empty(),
Block::Leaf(LinkDefinition) => line.starts_with(' ') && !line.trim().is_empty(),
Block::Container(Blockquote) => line.trim().starts_with('>'),
Block::Container(ListItem(..)) => {
@@ -494,7 +638,15 @@ impl BlockParser {
!((&mut c).take(fence_length).all(|c| c == fence)
&& c.next().map_or(true, char::is_whitespace))
}
- Block::Container(List { .. } | DescriptionList) => panic!(),
+ Block::Container(List { .. } | DescriptionList | TableRow { .. })
+ | Block::Leaf(TableCell(..)) => {
+ panic!()
+ }
+ Block::Container(Table) => {
+ let line = line.trim().as_bytes();
+ // The table continues only while lines end with an unescaped '|'. The pattern needs
+ // two bytes, so a blank line or a lone "|" ends the table instead of underflowing.
+ matches!(line, [.., prev, b'|'] if *prev != b'\\')
}
}
@@ -615,6 +767,7 @@ fn lines(src: &str) -> impl Iterator- + '_ {
mod test {
use crate::tree::EventKind;
use crate::tree::EventKind::*;
+ use crate::Alignment;
use crate::OrderedListNumbering::*;
use crate::OrderedListStyle::*;
@@ -1242,6 +1395,90 @@ mod test {
);
}
+ #[test]
+ fn parse_table() {
+ test_parse!(
+ concat!(
+ "|a|b|c|\n", //
+ "|-|-|-|\n", //
+ "|1|2|3|\n", //
+ ),
+ (Enter(Container(Table)), ""),
+ (Enter(Container(TableRow { head: true })), "|"),
+ (Enter(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Inline, "a"),
+ (Exit(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Enter(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Inline, "b"),
+ (Exit(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Enter(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Inline, "c"),
+ (Exit(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Exit(Container(TableRow { head: true })), "|"),
+ (Enter(Container(TableRow { head: false })), "|"),
+ (Enter(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Inline, "1"),
+ (Exit(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Enter(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Inline, "2"),
+ (Exit(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Enter(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Inline, "3"),
+ (Exit(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Exit(Container(TableRow { head: false })), "|"),
+ (Exit(Container(Table)), "")
+ );
+ }
+
+ #[test]
+ fn parse_table_post() {
+ test_parse!(
+ "|a|\npara",
+ (Enter(Container(Table)), ""),
+ (Enter(Container(TableRow { head: false })), "|"),
+ (Enter(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Inline, "a"),
+ (Exit(Leaf(TableCell(Alignment::Unspecified))), "|"),
+ (Exit(Container(TableRow { head: false })), "|"),
+ (Exit(Container(Table)), ""),
+ (Enter(Leaf(Paragraph)), ""),
+ (Inline, "para"),
+ (Exit(Leaf(Paragraph)), ""),
+ );
+ }
+
+ #[test]
+ fn parse_table_align() {
+ test_parse!(
+ concat!(
+ "|:---|:----:|----:|\n",
+ "|left|center|right|\n", //
+ ),
+ (Enter(Container(Table)), ""),
+ (Enter(Container(TableRow { head: false })), "|"),
+ (Enter(Leaf(TableCell(Alignment::Left))), "|"),
+ (Inline, "left"),
+ (Exit(Leaf(TableCell(Alignment::Left))), "|"),
+ (Enter(Leaf(TableCell(Alignment::Center))), "|"),
+ (Inline, "center"),
+ (Exit(Leaf(TableCell(Alignment::Center))), "|"),
+ (Enter(Leaf(TableCell(Alignment::Right))), "|"),
+ (Inline, "right"),
+ (Exit(Leaf(TableCell(Alignment::Right))), "|"),
+ (Exit(Container(TableRow { head: false })), "|"),
+ (Exit(Container(Table)), "")
+ );
+ }
+
+ #[test]
+ fn parse_table_sep_row_only() {
+ test_parse!(
+ "|-|-|",
+ (Enter(Container(Table)), ""),
+ (Exit(Container(Table)), "")
+ );
+ }
+
macro_rules! test_block {
($src:expr, $kind:expr, $str:expr, $len:expr $(,)?) => {
let lines = super::lines($src).map(|sp| sp.of($src));
diff --git a/src/html.rs b/src/html.rs
index 552be8d..ab10330 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -145,7 +145,7 @@ impl<'s, I: Iterator
- >, W: std::fmt::Write> Writer<'s, I, W> {
continue;
}
Container::Table => self.out.write_str("
self.out.write_str(" self.out.write_str("
self.out.write_str(" {
if matches!(self.list_tightness.last(), Some(true)) {
@@ -154,7 +154,7 @@ impl<'s, I: Iterator
- >, W: std::fmt::Write> Writer<'s, I, W> {
self.out.write_str("
write!(self.out, " self.out.write_str(" self.out.write_str(" | self.out.write_str(" self.out.write_str(" self.out.write_str(">, W: std::fmt::Write> Writer<'s, I, W> {
self.footnote_number = None;
}
Container::Table => self.out.write_str(" |
")?,
- Container::TableRow => self.out.write_str("")?,
+ Container::TableRow { .. } => self.out.write_str("")?,
Container::Div { .. } => self.out.write_str("")?,
Container::Paragraph => {
if matches!(self.list_tightness.last(), Some(true)) {
@@ -323,7 +323,7 @@ impl<'s, I: Iterator- >, W: std::fmt::Write> Writer<'s, I, W> {
self.out.write_str("
")?;
}
Container::Heading { level } => write!(self.out, "", level)?,
- Container::TableCell => self.out.write_str("")?,
+ Container::TableCell { .. } => self.out.write_str("")?,
Container::DescriptionTerm => self.out.write_str("")?,
Container::CodeBlock { .. } => self.out.write_str("")?,
Container::Span => self.out.write_str("")?,
diff --git a/src/lib.rs b/src/lib.rs
index 70a23c6..469f9c9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -47,7 +47,7 @@ pub enum Container<'s> {
/// A table element.
Table,
/// A row element of a table.
- TableRow,
+ TableRow { head: bool },
/// A block-level divider element.
Div { class: Option<&'s str> },
/// A paragraph.
@@ -55,7 +55,7 @@ pub enum Container<'s> {
/// A heading.
Heading { level: usize },
/// A cell element of row within a table.
- TableCell,
+ TableCell { alignment: Alignment, head: bool },
/// A term within a description list.
DescriptionTerm,
/// A block with raw markup for a specific output format.
@@ -106,12 +106,12 @@ impl<'s> Container<'s> {
| Self::DescriptionDetails
| Self::Footnote { .. }
| Self::Table
- | Self::TableRow
+ | Self::TableRow { .. }
| Self::Div { .. }
| Self::Paragraph
| Self::Heading { .. }
+ | Self::TableCell { .. }
| Self::DescriptionTerm
- | Self::TableCell
| Self::RawBlock { .. }
| Self::CodeBlock { .. } => true,
Self::Span
@@ -143,11 +143,11 @@ impl<'s> Container<'s> {
| Self::DescriptionDetails
| Self::Footnote { .. }
| Self::Table
- | Self::TableRow
+ | Self::TableRow { .. }
| Self::Div { .. } => true,
Self::Paragraph
| Self::Heading { .. }
- | Self::TableCell
+ | Self::TableCell { .. }
| Self::DescriptionTerm
| Self::RawBlock { .. }
| Self::CodeBlock { .. }
@@ -170,6 +170,14 @@ impl<'s> Container<'s> {
}
}
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Alignment {
+ Unspecified,
+ Left,
+ Center,
+ Right,
+}
+
#[derive(Debug, PartialEq, Eq)]
pub enum SpanLinkType {
Inline,
@@ -242,27 +250,6 @@ pub enum Atom<'s> {
Blankline,
}
-impl<'s> Container<'s> {
- fn from_leaf_block(content: &'s str, l: block::Leaf) -> Self {
- match l {
- block::Leaf::Paragraph => Self::Paragraph,
- block::Leaf::Heading => Self::Heading {
- level: content.len(),
- },
- block::Leaf::CodeBlock => {
- if let Some(format) = content.strip_prefix('=') {
- Self::RawBlock { format }
- } else {
- Self::CodeBlock {
- lang: (!content.is_empty()).then(|| content),
- }
- }
- }
- _ => todo!(),
- }
- }
-}
-
impl OrderedListNumbering {
fn parse_number(self, n: &str) -> u32 {
match self {
@@ -336,6 +323,8 @@ pub struct Parser<'s> {
/// Inline parser, recreated for each new inline.
inline_parser: Option>>,
+ table_head_row: bool,
+
/// Footnote references in the order they were encountered, without duplicates.
footnote_references: Vec<&'s str>,
/// Cache of footnotes to emit at the end.
@@ -376,6 +365,7 @@ impl<'s> Parser<'s> {
src,
link_definitions,
tree: branch,
+ table_head_row: false,
footnote_references: Vec::new(),
footnotes: std::collections::HashMap::new(),
footnote_index: 0,
@@ -533,7 +523,26 @@ impl<'s> Parser<'s> {
self.inline_parser =
Some(inline::Parser::new(self.inlines.chars()));
}
- Container::from_leaf_block(content, l)
+ match l {
+ block::Leaf::Paragraph => Container::Paragraph,
+ block::Leaf::Heading => Container::Heading {
+ level: content.len(),
+ },
+ block::Leaf::CodeBlock => {
+ if let Some(format) = content.strip_prefix('=') {
+ Container::RawBlock { format }
+ } else {
+ Container::CodeBlock {
+ lang: (!content.is_empty()).then(|| content),
+ }
+ }
+ }
+ block::Leaf::TableCell(alignment) => Container::TableCell {
+ alignment,
+ head: self.table_head_row,
+ },
+ block::Leaf::LinkDefinition => unreachable!(),
+ }
}
block::Node::Container(c) => match c {
block::Container::Blockquote => Container::Blockquote,
@@ -573,6 +582,13 @@ impl<'s> Parser<'s> {
Container::ListItem
}
}
+ block::Container::Table => Container::Table,
+ block::Container::TableRow { head } => {
+ if enter {
+ self.table_head_row = head;
+ }
+ Container::TableRow { head }
+ }
},
};
if enter {
diff --git a/src/tree.rs b/src/tree.rs
index 3266570..de049cc 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -157,7 +157,18 @@ pub struct Builder {
depth: usize,
}
-impl Builder {
+impl<'a, C, A> From<&'a mut NodeKind<C, A>> for Element<'a, C, A> {
+ fn from(kind: &'a mut NodeKind<C, A>) -> Self {
+ match kind {
+ NodeKind::Root => unreachable!(), // NOTE(review): presumably the root is never exposed
+ NodeKind::Container(c, ..) => Element::Container(c), // remaining fields are not exposed
+ NodeKind::Atom(a) => Element::Atom(a),
+ NodeKind::Inline => Element::Inline,
+ }
+ }
+}
+
+impl Builder {
pub(super) fn new() -> Self {
Builder {
nodes: vec![Node {
@@ -206,6 +217,19 @@ impl Builder {
}
}
+ /// Exit the current container and discard it together with everything added inside it.
+ pub(super) fn exit_discard(&mut self) {
+ self.exit();
+ let exited = self.branch.pop().unwrap(); // presumably the container that was just exited
+ self.nodes.drain(exited.index()..); // drop that node and every node stored after it
+ let (ni, has_parent) = self.relink(exited, None); // clear the one link that led to it
+ if has_parent {
+ self.head = Some(ni); // the link was a container's child link (see `relink`)
+ } else {
+ self.branch.push(ni); // the link was a previous sibling's `next` link
+ }
+ }
+
pub(super) fn depth(&self) -> usize {
self.depth
}
@@ -219,6 +243,23 @@ impl Builder {
}
}
+ /// Retrieve the specified node and every node added after it, up to its next sibling (excluded).
+ pub(super) fn children(
+ &mut self,
+ node: NodeIndex,
+ ) -> impl Iterator<Item = (Element<C, A>, Span)> {
+ assert!(matches!(
+ self.nodes[node.index()].kind,
+ NodeKind::Container(..)
+ ));
+ let end = self.nodes[node.index()]
+ .next
+ .map_or(self.nodes.len(), NodeIndex::index); // no next sibling yet: run to the end
+ self.nodes[node.index()..end]
+ .iter_mut()
+ .map(|n| (Element::from(&mut n.kind), n.span))
+ }
+
pub(super) fn finish(self) -> Tree {
assert_eq!(self.depth, 0);
let head = self.nodes[NodeIndex::root().index()].next;
@@ -257,6 +298,25 @@ impl Builder {
self.head = Some(ni);
ni
}
+
+ /// Redirect the link that leads to `prev` (a node's `next` link or a container's child link)
+ /// to `next`. Returns the node owning that link and whether it was the child link.
+ /// Panics if no node links to `prev`.
+ fn relink(&mut self, prev: NodeIndex, next: Option<NodeIndex>) -> (NodeIndex, bool) {
+ for (i, n) in self.nodes.iter_mut().enumerate().rev() {
+ let ni = NodeIndex::new(i);
+ if n.next == Some(prev) {
+ n.next = next;
+ return (ni, false);
+ } else if let NodeKind::Container(_, child) = &mut n.kind {
+ if *child == Some(prev) {
+ *child = next;
+ return (ni, true);
+ }
+ }
+ }
+ panic!()
+ }
}
impl std::fmt::Debug