diff --git a/src/attr.rs b/src/attr.rs index 8a7bfa6..63079f6 100644 --- a/src/attr.rs +++ b/src/attr.rs @@ -100,6 +100,11 @@ impl<'s> Attributes<'s> { } } + #[must_use] + pub fn get(&self, key: &str) -> Option<&str> { + self.iter().find(|(k, _)| *k == key).map(|(_, v)| v) + } + pub fn iter(&self) -> impl Iterator + '_ { self.0 .iter() diff --git a/src/block.rs b/src/block.rs index 288364b..0b411eb 100644 --- a/src/block.rs +++ b/src/block.rs @@ -59,7 +59,9 @@ pub enum Leaf { /// Span is `#` characters. /// Each inline is a line. - Heading, + Heading { + has_section: bool, + }, /// Span is '|'. /// Has zero or one inline for the cell contents. @@ -254,7 +256,7 @@ impl<'s> TreeParser<'s> { fn parse_leaf( &mut self, - leaf: Leaf, + mut leaf: Leaf, k: &Kind, span: Span, lines: &mut [Span], @@ -300,6 +302,10 @@ impl<'s> TreeParser<'s> { } } + if let Leaf::Heading { has_section } = &mut leaf { + *has_section = top_level; + } + self.tree.enter(Node::Leaf(leaf), span); lines .iter() @@ -573,7 +579,7 @@ impl From<&Kind> for Block { match kind { Kind::Atom(a) => Self::Atom(*a), Kind::Paragraph => Self::Leaf(Paragraph), - Kind::Heading { .. } => Self::Leaf(Heading), + Kind::Heading { .. } => Self::Leaf(Heading { has_section: false }), Kind::Fenced { kind: FenceKind::CodeBlock(..), .. @@ -983,13 +989,13 @@ mod test { "## b\n", // ), (Enter(Container(Section)), "#"), - (Enter(Leaf(Heading)), "#"), + (Enter(Leaf(Heading { has_section: true })), "#"), (Inline, "a"), - (Exit(Leaf(Heading)), "#"), + (Exit(Leaf(Heading { has_section: true })), "#"), (Enter(Container(Section)), "##"), - (Enter(Leaf(Heading)), "##"), + (Enter(Leaf(Heading { has_section: true })), "##"), (Inline, "b"), - (Exit(Leaf(Heading)), "##"), + (Exit(Leaf(Heading { has_section: true })), "##"), (Exit(Container(Section)), "##"), (Exit(Container(Section)), "#"), ); @@ -1003,9 +1009,9 @@ mod test { "heading\n", // ), (Enter(Container(Section)), "#"), - (Enter(Leaf(Heading)), "#"), + (Enter(Leaf(Heading { has_section: true })), "#"), (Inline, "heading"), - (Exit(Leaf(Heading)), "#"), + (Exit(Leaf(Heading { has_section: true })), "#"), (Exit(Container(Section)), "#"), ); } @@ -1021,17 +1027,17 @@ mod test { "15\n", // ), (Enter(Container(Section)), "#"), - (Enter(Leaf(Heading)), "#"), + (Enter(Leaf(Heading { has_section: true })), "#"), (Inline, "2"), - (Exit(Leaf(Heading)), "#"), + (Exit(Leaf(Heading { has_section: true })), "#"), (Atom(Blankline), "\n"), (Exit(Container(Section)), "#"), (Enter(Container(Section)), "#"), - (Enter(Leaf(Heading)), "#"), + (Enter(Leaf(Heading { has_section: true })), "#"), (Inline, "8\n"), (Inline, "12\n"), (Inline, "15"), - (Exit(Leaf(Heading)), "#"), + (Exit(Leaf(Heading { has_section: true })), "#"), (Exit(Container(Section)), "#"), ); } @@ -1045,11 +1051,11 @@ mod test { "c\n", // ), (Enter(Container(Section)), "#"), - (Enter(Leaf(Heading)), "#"), + (Enter(Leaf(Heading { has_section: true })), "#"), (Inline, "a\n"), (Inline, "b\n"), (Inline, "c"), - (Exit(Leaf(Heading)), "#"), + (Exit(Leaf(Heading { has_section: true })), "#"), (Exit(Container(Section)), "#"), ); } @@ -1071,39 +1077,39 @@ mod test { "# b\n", ), (Enter(Container(Section)), "#"), - (Enter(Leaf(Heading)), "#"), + (Enter(Leaf(Heading { has_section: true })), "#"), (Inline, "a"), - (Exit(Leaf(Heading)), "#"), + (Exit(Leaf(Heading { has_section: true })), "#"), (Atom(Blankline), "\n"), (Enter(Container(Section)), "##"), - (Enter(Leaf(Heading)), "##"), + (Enter(Leaf(Heading { has_section: true })), "##"), (Inline, "aa"), - (Exit(Leaf(Heading)), "##"), + (Exit(Leaf(Heading { has_section: true })), "##"), (Atom(Blankline), "\n"), (Enter(Container(Section)), "####"), - (Enter(Leaf(Heading)), "####"), + (Enter(Leaf(Heading { has_section: true })), "####"), (Inline, "aaaa"), - (Exit(Leaf(Heading)), "####"), + (Exit(Leaf(Heading { has_section: true })), "####"), (Atom(Blankline), "\n"), (Exit(Container(Section)), "####"), (Exit(Container(Section)), "##"), (Enter(Container(Section)), "##"), - (Enter(Leaf(Heading)), "##"), + (Enter(Leaf(Heading { has_section: true })), "##"), (Inline, "ab"), - (Exit(Leaf(Heading)), "##"), + (Exit(Leaf(Heading { has_section: true })), "##"), (Atom(Blankline), "\n"), (Enter(Container(Section)), "###"), - (Enter(Leaf(Heading)), "###"), + (Enter(Leaf(Heading { has_section: true })), "###"), (Inline, "aba"), - (Exit(Leaf(Heading)), "###"), + (Exit(Leaf(Heading { has_section: true })), "###"), (Atom(Blankline), "\n"), (Exit(Container(Section)), "###"), (Exit(Container(Section)), "##"), (Exit(Container(Section)), "#"), (Enter(Container(Section)), "#"), - (Enter(Leaf(Heading)), "#"), + (Enter(Leaf(Heading { has_section: true })), "#"), (Inline, "b"), - (Exit(Leaf(Heading)), "#"), + (Exit(Leaf(Heading { has_section: true })), "#"), (Exit(Container(Section)), "#"), ); } @@ -1141,9 +1147,9 @@ mod test { (Inline, "a"), (Exit(Leaf(Paragraph)), ""), (Atom(Blankline), "\n"), - (Enter(Leaf(Heading)), "##"), + (Enter(Leaf(Heading { has_section: false })), "##"), (Inline, "hl"), - (Exit(Leaf(Heading)), "##"), + (Exit(Leaf(Heading { has_section: false })), "##"), (Atom(Blankline), "\n"), (Enter(Leaf(Paragraph)), ""), (Inline, "para"), diff --git a/src/html.rs b/src/html.rs index 37c61d3..8554f1b 100644 --- a/src/html.rs +++ b/src/html.rs @@ -148,7 +148,7 @@ impl<'s, I: Iterator>, W: std::fmt::Write> Writer<'s, I, W> { } Container::Table => self.out.write_str(" self.out.write_str(" self.out.write_str(" self.out.write_str(" self.out.write_str(" { if matches!(self.list_tightness.last(), Some(true)) { @@ -156,7 +156,7 @@ impl<'s, I: Iterator>, W: std::fmt::Write> Writer<'s, I, W> { } self.out.write_str(" write!(self.out, " write!(self.out, " self.out.write_str(" self.out.write_str(" self.out.write_str(">, W: std::fmt::Write> Writer<'s, I, W> { write!(self.out, r#" {}="{}""#, a, v)?; } + if let Container::Heading { + id, + has_section: false, + .. + } + | Container::Section { id } = &c + { + if !attrs.iter().any(|(a, _)| a == "id") { + write!(self.out, r#" id="{}""#, id)?; + } + } + if attrs.iter().any(|(a, _)| a == "class") || matches!( c, @@ -312,7 +324,7 @@ impl<'s, I: Iterator>, W: std::fmt::Write> Writer<'s, I, W> { } Container::Table => self.out.write_str("")?, Container::TableRow { .. } => self.out.write_str("")?, - Container::Section => self.out.write_str("")?, + Container::Section { .. } => self.out.write_str("")?, Container::Div { .. } => self.out.write_str("")?, Container::Paragraph => { if matches!(self.list_tightness.last(), Some(true)) { @@ -333,7 +345,7 @@ impl<'s, I: Iterator>, W: std::fmt::Write> Writer<'s, I, W> { } self.out.write_str("

")?; } - Container::Heading { level } => write!(self.out, "", level)?, + Container::Heading { level, .. } => write!(self.out, "", level)?, Container::TableCell { head: false, .. } => self.out.write_str("")?, Container::TableCell { head: true, .. } => self.out.write_str("")?, Container::Caption => self.out.write_str("")?, diff --git a/src/lib.rs b/src/lib.rs index 52f0265..f0bef23 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +use std::fmt::Write; + pub mod html; mod attr; @@ -49,13 +51,17 @@ pub enum Container<'s> { /// A row element of a table. TableRow { head: bool }, /// A section belonging to a top level heading. - Section, + Section { id: CowStr<'s> }, /// A block-level divider element. Div { class: Option<&'s str> }, /// A paragraph. Paragraph, /// A heading. - Heading { level: u16 }, + Heading { + level: u16, + has_section: bool, + id: CowStr<'s>, + }, /// A cell element of row within a table. TableCell { alignment: Alignment, head: bool }, /// A caption within a table. @@ -107,7 +113,7 @@ impl<'s> Container<'s> { | Self::Footnote { .. } | Self::Table | Self::TableRow { .. } - | Self::Section + | Self::Section { .. } | Self::Div { .. } | Self::Paragraph | Self::Heading { .. } @@ -144,7 +150,7 @@ impl<'s> Container<'s> { | Self::Footnote { .. } | Self::Table | Self::TableRow { .. } - | Self::Section + | Self::Section { .. } | Self::Div { .. } => true, Self::Paragraph | Self::Heading { .. } @@ -321,15 +327,12 @@ impl OrderedListStyle { pub struct Parser<'s> { src: &'s str, - /// Link definitions encountered during block parse, written once. - link_definitions: std::collections::HashMap<&'s str, (CowStr<'s>, attr::Attributes<'s>)>, - - /// Block tree cursor. + /// Block tree parsed at first. tree: block::Tree, - /// Spans to the inlines in the block currently being parsed. - inlines: span::InlineSpans<'s>, - /// Inline parser, recreated for each new inline. - inline_parser: Option>>, + + /// Contents obtained by the prepass. + pre_pass: PrePass<'s>, + /// Last parsed block attributes block_attributes: Attributes<'s>, @@ -344,47 +347,168 @@ pub struct Parser<'s> { footnote_index: usize, /// Currently within a footnote. footnote_active: bool, + + /// Spans to the inlines in the leaf block currently being parsed. + inlines: span::InlineSpans<'s>, + /// Inline parser, recreated for each new inline. + inline_parser: Option>>, +} + +struct Heading { + /// Location of heading in src. + location: usize, + /// Automatically generated id from heading text. + id_auto: String, + /// Overriding id from an explicit attribute on the heading. + id_override: Option, +} + +/// Because of potential future references, an initial pass is required to obtain all definitions. +struct PrePass<'s> { + /// Link definitions and their attributes. + link_definitions: std::collections::HashMap<&'s str, (CowStr<'s>, attr::Attributes<'s>)>, + /// Cache of all heading ids. + headings: Vec, + /// Indices to headings sorted lexicographically. + headings_lex: Vec, +} + +impl<'s> PrePass<'s> { + #[must_use] + fn new(src: &'s str, mut tree: block::Tree) -> Self { + let mut link_definitions = std::collections::HashMap::new(); + let mut headings: Vec = Vec::new(); + + let mut inlines = span::InlineSpans::new(src); + + let mut attr_prev: Option = None; + while let Some(e) = tree.next() { + match e.kind { + tree::EventKind::Enter(block::Node::Leaf(block::Leaf::LinkDefinition)) => { + // All link definition tags have to be obtained initially, as references can + // appear before the definition. + let tag = e.span.of(src); + let attrs = + attr_prev.map_or_else(Attributes::new, |sp| attr::parse(sp.of(src))); + let url = match tree.count_children() { + 0 => "".into(), + 1 => tree.take_inlines().next().unwrap().of(src).trim().into(), + _ => tree.take_inlines().map(|sp| sp.of(src).trim()).collect(), + }; + link_definitions.insert(tag, (url, attrs)); + } + tree::EventKind::Enter(block::Node::Leaf(block::Leaf::Heading { .. })) => { + // All headings ids have to be obtained initially, as references can appear + // before the heading. Additionally, determining the id requires inline parsing + // as formatting must be removed. + // + // We choose to parse all headers twice instead of caching them. + let attrs = attr_prev.map(|sp| attr::parse(sp.of(src))); + let id_override = attrs + .as_ref() + .and_then(|attrs| attrs.get("id")) + .map(ToString::to_string); + + inlines.set_spans(tree.take_inlines()); + let mut id_auto = String::new(); + inline::Parser::new(inlines.chars()).for_each(|ev| match ev.kind { + inline::EventKind::Str => { + let mut chars = inlines.slice(ev.span).chars().peekable(); + while let Some(c) = chars.next() { + if c.is_whitespace() { + while chars.peek().map_or(false, |c| c.is_whitespace()) { + chars.next(); + } + if !id_auto.is_empty() { + id_auto.push('-'); + } + } else if !c.is_ascii_punctuation() || matches!(c, '-' | '_') { + id_auto.push(c); + } + } + } + inline::EventKind::Atom(inline::Atom::Softbreak) => { + id_auto.push('-'); + } + _ => {} + }); + id_auto.drain(id_auto.trim_end_matches('-').len()..); + + // ensure id unique + if headings.iter().any(|h| h.id_auto == id_auto) || id_auto.is_empty() { + if id_auto.is_empty() { + id_auto.push('s'); + } + let mut num = 1; + id_auto.push('-'); + let i_num = id_auto.len(); + write!(id_auto, "{}", num).unwrap(); + while headings.iter().any(|h| h.id_auto == id_auto) { + num += 1; + id_auto.drain(i_num..); + write!(id_auto, "{}", num).unwrap(); + } + } + + headings.push(Heading { + location: e.span.start(), + id_auto, + id_override, + }); + } + tree::EventKind::Atom(block::Atom::Attributes) => { + attr_prev = Some(e.span); + } + tree::EventKind::Enter(..) + | tree::EventKind::Exit(block::Node::Container(block::Container::Section { + .. + })) => {} + _ => { + attr_prev = None; + } + } + } + + let mut headings_lex = (0..headings.len()).collect::>(); + headings_lex.sort_by_key(|i| &headings[*i].id_auto); + + Self { + link_definitions, + headings, + headings_lex, + } + } + + fn heading_id(&self, i: usize) -> &str { + let h = &self.headings[i]; + h.id_override.as_ref().unwrap_or(&h.id_auto) + } + + fn heading_id_by_location(&self, location: usize) -> Option<&str> { + self.headings + .binary_search_by_key(&location, |h| h.location) + .ok() + .map(|i| self.heading_id(i)) + } + + fn heading_id_by_tag(&self, tag: &str) -> Option<&str> { + self.headings_lex + .binary_search_by_key(&tag, |i| &self.headings[*i].id_auto) + .ok() + .map(|i| self.heading_id(i)) + } } impl<'s> Parser<'s> { #[must_use] pub fn new(src: &'s str) -> Self { let tree = block::parse(src); - - // All link definition tags have to be obtained initially, as references can appear before - // the definition. - let link_definitions = { - let mut branch = tree.clone(); - let mut defs = std::collections::HashMap::new(); - let mut attr_prev: Option = None; - while let Some(e) = branch.next() { - if let tree::EventKind::Enter(block::Node::Leaf(block::Leaf::LinkDefinition)) = - e.kind - { - let tag = e.span.of(src); - let attrs = - attr_prev.map_or_else(Attributes::new, |sp| attr::parse(sp.of(src))); - let url = match branch.count_children() { - 0 => "".into(), - 1 => branch.take_inlines().next().unwrap().of(src).trim().into(), - _ => branch.take_inlines().map(|sp| sp.of(src).trim()).collect(), - }; - defs.insert(tag, (url, attrs)); - } else if let tree::EventKind::Atom(block::Atom::Attributes) = e.kind { - attr_prev = Some(e.span); - } else { - attr_prev = None; - } - } - defs - }; - - let branch = tree.clone(); + let pre_pass = PrePass::new(src, tree.clone()); Self { src, - link_definitions, - tree: branch, + tree, + pre_pass, block_attributes: Attributes::new(), table_head_row: false, footnote_references: Vec::new(), @@ -453,12 +577,18 @@ impl<'s> Parser<'s> { CowStr::Owned(s) => s.replace('\n', " ").into(), s @ CowStr::Borrowed(_) => s, }; - let (url, attrs_def) = self - .link_definitions - .get(tag.as_ref()) - .cloned() - .unwrap_or_else(|| ("".into(), Attributes::new())); - attributes.union(attrs_def); + let link_def = + self.pre_pass.link_definitions.get(tag.as_ref()).cloned(); + + let url = if let Some((url, attrs_def)) = link_def { + attributes.union(attrs_def); + url + } else { + self.pre_pass + .heading_id_by_tag(tag.as_ref()) + .map_or_else(|| "".into(), |id| format!("#{}", id).into()) + }; + if matches!(c, inline::Container::ReferenceLink) { Container::Link(url, LinkType::Span(SpanLinkType::Reference)) } else { @@ -561,8 +691,15 @@ impl<'s> Parser<'s> { } match l { block::Leaf::Paragraph => Container::Paragraph, - block::Leaf::Heading => Container::Heading { + block::Leaf::Heading { has_section } => Container::Heading { level: content.len().try_into().unwrap(), + has_section, + id: self + .pre_pass + .heading_id_by_location(ev.span.start()) + .unwrap_or_default() + .to_string() + .into(), }, block::Leaf::CodeBlock => { if let Some(format) = content.strip_prefix('=') { @@ -631,7 +768,14 @@ impl<'s> Parser<'s> { } Container::TableRow { head } } - block::Container::Section => Container::Section, + block::Container::Section => Container::Section { + id: self + .pre_pass + .heading_id_by_location(ev.span.start()) + .unwrap_or_default() + .to_string() + .into(), + }, }, }; if enter { @@ -751,20 +895,49 @@ mod test { fn heading() { test_parse!( "#\n", - Start(Section, Attributes::new()), - Start(Heading { level: 1 }, Attributes::new()), - End(Heading { level: 1 }), - End(Section), + Start(Section { id: "s-1".into() }, Attributes::new()), + Start( + Heading { + level: 1, + has_section: true, + id: "s-1".into() + }, + Attributes::new() + ), + End(Heading { + level: 1, + has_section: true, + id: "s-1".into() + }), + End(Section { id: "s-1".into() }), ); test_parse!( "# abc\ndef\n", - Start(Section, Attributes::new()), - Start(Heading { level: 1 }, Attributes::new()), + Start( + Section { + id: "abc-def".into() + }, + Attributes::new() + ), + Start( + Heading { + level: 1, + has_section: true, + id: "abc-def".into() + }, + Attributes::new() + ), Str("abc".into()), Atom(Softbreak), Str("def".into()), - End(Heading { level: 1 }), - End(Section), + End(Heading { + level: 1, + has_section: true, + id: "abc-def".into(), + }), + End(Section { + id: "abc-def".into() + }), ); } @@ -776,16 +949,41 @@ mod test { "{a=b}\n", "# def\n", // ), - Start(Section, Attributes::new()), - Start(Heading { level: 1 }, Attributes::new()), + Start(Section { id: "abc".into() }, Attributes::new()), + Start( + Heading { + level: 1, + has_section: true, + id: "abc".into() + }, + Attributes::new() + ), Str("abc".into()), - End(Heading { level: 1 }), - End(Section), - Start(Section, [("a", "b")].into_iter().collect(),), - Start(Heading { level: 1 }, Attributes::new(),), + End(Heading { + level: 1, + has_section: true, + id: "abc".into(), + }), + End(Section { id: "abc".into() }), + Start( + Section { id: "def".into() }, + [("a", "b")].into_iter().collect(), + ), + Start( + Heading { + level: 1, + has_section: true, + id: "def".into() + }, + Attributes::new(), + ), Str("def".into()), - End(Heading { level: 1 }), - End(Section), + End(Heading { + level: 1, + has_section: true, + id: "def".into(), + }), + End(Section { id: "def".into() }), ); } diff --git a/src/span.rs b/src/span.rs index 3874063..126f9b4 100644 --- a/src/span.rs +++ b/src/span.rs @@ -302,7 +302,7 @@ impl<'s, 'i> DiscontinuousString<'s> for InlineSpansSlice<'s, 'i> { } } -type InlineSpansSliceIter<'i> = std::iter::Chain< +pub type InlineSpansSliceIter<'i> = std::iter::Chain< std::iter::Chain, std::iter::Copied>>, std::iter::Once, >;