diff --git a/src/attr.rs b/src/attr.rs new file mode 100644 index 0000000..6ba823c --- /dev/null +++ b/src/attr.rs @@ -0,0 +1,328 @@ +use crate::CowStr; +use crate::DiscontinuousString; +use crate::Span; + +use State::*; + +pub fn valid>(chars: I) -> bool { + !Parser::new(chars).any(|e| matches!(e, Element::Invalid)) +} + +// Attributes are relatively rare, we choose to pay 8 bytes always and sometimes an extra +// indirection instead of always 24 bytes. +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct Attributes<'s>(Option)>>>); + +impl<'s> Attributes<'s> { + #[must_use] + pub fn new() -> Self { + Self::default() + } + + #[must_use] + pub fn take(&mut self) -> Self { + Self(self.0.take()) + } + + pub(crate) fn parse>(&mut self, input: &S) -> bool { + for elem in Parser::new(input.chars()) { + match elem { + Element::Class(c) => self.add("class", input.src(c)), + Element::Identifier(i) => self.add("id", input.src(i)), + Element::Attribute(a, v) => self.add( + match input.src(a) { + CowStr::Owned(_) => panic!(), + CowStr::Borrowed(s) => s, + }, + input.src(v), + ), + Element::Invalid => return false, + } + } + true + } + + fn add(&mut self, attr: &'s str, val: CowStr<'s>) { + if self.0.is_none() { + self.0 = Some(Vec::new().into()); + }; + + let attrs = self.0.as_mut().unwrap(); + attrs.push((attr, val)); + } + + #[cfg(test)] + pub fn iter(&self) -> impl Iterator + '_ { + self.0 + .iter() + .flat_map(|v| v.iter().map(|(a, b)| (*a, b.as_ref()))) + } +} + +#[cfg(test)] +impl<'s> FromIterator<(&'s str, &'s str)> for Attributes<'s> { + fn from_iter>(iter: I) -> Self { + let attrs = iter + .into_iter() + .map(|(a, v)| (a, v.into())) + .collect::>(); + if attrs.is_empty() { + Attributes::new() + } else { + Attributes(Some(attrs.into())) + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum State { + Start, + Whitespace, + Comment, + ClassFirst, + Class, + IdentifierFirst, + Identifier, + Attribute, + ValueFirst, + Value, + ValueQuoted, + Done, + Invalid, +} + +struct Parser { + chars: I, + pos: usize, + state: State, +} + +impl> Parser { + fn new(chars: I) -> Self { + Parser { + chars, + pos: 0, + state: Start, + } + } + + fn step_char(&mut self) -> Option { + self.chars.next().map(|c| { + self.pos += c.len_utf8(); + match self.state { + Start => match c { + '{' => Whitespace, + c if c.is_whitespace() => Start, + _ => Invalid, + }, + Whitespace => match c { + '}' => Done, + '.' => ClassFirst, + '#' => IdentifierFirst, + '%' => Comment, + c if c.is_ascii_alphanumeric() || matches!(c, '_' | ':' | '-') => Attribute, + c if c.is_whitespace() => Whitespace, + _ => Invalid, + }, + Comment => { + if c == '%' { + Whitespace + } else { + Comment + } + } + s @ (ClassFirst | IdentifierFirst) => { + if is_name_start(c) { + match s { + ClassFirst => Class, + IdentifierFirst => Identifier, + _ => panic!(), + } + } else { + Invalid + } + } + s @ (Class | Identifier | Value) => { + if is_name(c) { + s + } else if c.is_whitespace() { + Whitespace + } else if c == '}' { + Done + } else { + Invalid + } + } + Attribute => { + if is_name(c) { + Attribute + } else if c == '=' { + ValueFirst + } else { + Invalid + } + } + ValueFirst => { + if is_name(c) { + Value + } else if c == '"' { + ValueQuoted + } else { + Invalid + } + } + ValueQuoted => { + if c == '"' { + Whitespace + } else { + ValueQuoted + } + } + Done => { + if c.is_whitespace() { + Done + } else { + Invalid + } + } + Invalid => panic!(), + } + }) + } + + fn step(&mut self) -> (State, Span) { + let start = self.pos.saturating_sub(1); + + while let Some(state_next) = self.step_char() { + if self.state != state_next { + return ( + std::mem::replace(&mut self.state, state_next), + Span::new(start, self.pos - 1), + ); + } + } + + ( + if self.state == Done { Done } else { Invalid }, + Span::new(start, self.pos), + ) + } +} + +fn is_name_start(c: char) -> bool { + c.is_ascii_alphanumeric() || matches!(c, '_' | ':') +} + +fn is_name(c: char) -> bool { + is_name_start(c) || c.is_ascii_digit() || matches!(c, '-' | '.') +} + +enum Element { + Class(Span), + Identifier(Span), + Attribute(Span, Span), + Invalid, +} + +impl> Iterator for Parser { + type Item = Element; + + fn next(&mut self) -> Option { + loop { + let (st, span0) = self.step(); + return match st { + ClassFirst | IdentifierFirst => { + let (st, span1) = self.step(); + Some(match st { + Class => Element::Class(span1), + Identifier => Element::Identifier(span1), + _ => return Some(Element::Invalid), + }) + } + Attribute => { + let (st, _span1) = self.step(); + match st { + ValueFirst => { + let (st, span2) = self.step(); + match st { + Value => Some(Element::Attribute(span0, span2)), + ValueQuoted => Some(Element::Attribute(span0, span2.skip(1))), + Invalid => Some(Element::Invalid), + _ => panic!("{:?}", st), + } + } + Invalid => Some(Element::Invalid), + _ => panic!("{:?}", st), + } + } + Comment | Start | Whitespace => continue, + Done => None, + Invalid => Some(Element::Invalid), + _ => panic!("{:?}", st), + }; + } + } +} + +#[cfg(test)] +mod test { + macro_rules! test_attr { + ($src:expr $(,$($av:expr),* $(,)?)?) => { + #[allow(unused)] + let mut attr =super::Attributes::new(); + attr.parse(&$src); + let actual = attr.iter().collect::>(); + let expected = &[$($($av),*,)?]; + assert_eq!(actual, expected, "\n\n{}\n\n", $src); + }; + } + + #[test] + fn empty() { + test_attr!("{}"); + } + + #[test] + fn class_id() { + test_attr!( + "{.some_class #some_id}", + ("class", "some_class"), + ("id", "some_id"), + ); + } + + #[test] + fn value_unquoted() { + test_attr!( + "{attr0=val0 attr1=val1}", + ("attr0", "val0"), + ("attr1", "val1"), + ); + } + + #[test] + fn value_quoted() { + test_attr!( + r#"{attr0="val0" attr1="val1"}"#, + ("attr0", "val0"), + ("attr1", "val1"), + ); + test_attr!( + r#"{#id .class style="color:red"}"#, + ("id", "id"), + ("class", "class"), + ("style", "color:red") + ); + } + + #[test] + fn comment() { + test_attr!("{%%}"); + test_attr!("{ % abc % }"); + test_attr!( + "{ .some_class % abc % #some_id}", + ("class", "some_class"), + ("id", "some_id"), + ); + } +} diff --git a/src/block.rs b/src/block.rs index 1393e35..b504603 100644 --- a/src/block.rs +++ b/src/block.rs @@ -1,6 +1,7 @@ use crate::Span; use crate::EOF; +use crate::attr; use crate::tree; use Atom::*; @@ -70,7 +71,7 @@ pub enum Leaf { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Container { - /// Span is first `>` character. + /// Span is `>`. Blockquote, /// Span is class specifier. @@ -79,7 +80,7 @@ pub enum Container { /// Span is the list marker. ListItem, - /// Span is first `[^` instance. + /// Span is `[^`. Footnote, } @@ -269,6 +270,8 @@ impl BlockParser { )) } } + '{' => attr::valid(line_t.chars()) + .then(|| (Block::Atom(Attributes), Span::by_len(start, line_t.len()))), '|' => (&line_t[line_t.len() - 1..] == "|" && &line_t[line_t.len() - 2..line_t.len() - 1] != "\\") .then(|| (Block::Leaf(Table), Span::by_len(start, 1))), @@ -614,6 +617,17 @@ mod test { ); } + #[test] + fn parse_attr() { + test_parse!( + "{.some_class}\npara\n", + (Atom(Attributes), "{.some_class}\n"), + (Enter(Leaf(Paragraph)), ""), + (Inline, "para"), + (Exit(Leaf(Paragraph)), ""), + ); + } + macro_rules! test_block { ($src:expr, $kind:expr, $str:expr, $len:expr $(,)?) => { let lines = super::lines($src).map(|sp| sp.of($src)); diff --git a/src/lib.rs b/src/lib.rs index afad3aa..9b345d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ pub mod html; +mod attr; mod block; mod inline; mod lex; @@ -8,6 +9,8 @@ mod tree; use span::Span; +pub use attr::Attributes; + type CowStr<'s> = std::borrow::Cow<'s, str>; const EOF: char = '\0'; @@ -217,11 +220,11 @@ pub enum Atom { EmDash, /// A thematic break, typically a horizontal rule. ThematicBreak, - /// A space that may not break a line. + /// A space that must not break a line. NonBreakingSpace, - /// A newline that may or may not break a line in the output format. + /// A newline that may or may not break a line in the output. Softbreak, - /// A newline that must break a line. + /// A newline that must break a line in the output. Hardbreak, /// An escape character, not visible in output. Escape, @@ -251,27 +254,6 @@ impl<'s> Container<'s> { } } -// Attributes are relatively rare, we choose to pay 8 bytes always and sometimes an extra -// indirection instead of always 24 bytes. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Attributes<'s>(Option>>); - -impl<'s> Attributes<'s> { - #[must_use] - pub fn none() -> Self { - Self(None) - } - - #[must_use] - pub fn take(&mut self) -> Self { - Self(self.0.take()) - } - - pub fn parse(&mut self, src: &'s str) { - todo!() - } -} - #[derive(Clone)] struct InlineChars<'s, 't> { src: &'s str, @@ -300,10 +282,86 @@ impl<'s, 't> Iterator for InlineChars<'s, 't> { } } +trait DiscontinuousString<'s> { + type Chars: Iterator; + + fn src(&self, span: Span) -> CowStr<'s>; + + fn chars(&self) -> Self::Chars; +} + +impl<'s> DiscontinuousString<'s> for &'s str { + type Chars = std::str::Chars<'s>; + + fn src(&self, span: Span) -> CowStr<'s> { + span.of(self).into() + } + + fn chars(&self) -> Self::Chars { + str::chars(self) + } +} + +impl<'s> DiscontinuousString<'s> for InlineSpans<'s> { + type Chars = InlineChars<'s, 'static>; + + /// Borrow if continuous, copy if discontiunous. + fn src(&self, span: Span) -> CowStr<'s> { + let mut a = 0; + let mut s = String::new(); + for sp in &self.spans { + let b = a + sp.len(); + if span.start() < b { + let r = if a <= span.start() { + if span.end() <= b { + // continuous + return CowStr::Borrowed( + &sp.of(self.src)[span.start() - a..span.end() - a], + ); + } + (span.start() - a)..sp.len() + } else { + 0..sp.len().min(span.end() - a) + }; + s.push_str(&sp.of(self.src)[r]); + } + a = b; + } + assert_eq!(span.len(), s.len()); + CowStr::Owned(s) + } + + fn chars(&self) -> Self::Chars { + // SAFETY: do not call set_spans while chars is in use + unsafe { std::mem::transmute(InlineChars::new(self.src, &self.spans)) } + } +} + +#[derive(Default)] +struct InlineSpans<'s> { + src: &'s str, + spans: Vec, +} + +impl<'s> InlineSpans<'s> { + fn new(src: &'s str) -> Self { + Self { + src, + spans: Vec::new(), + } + } + + fn set_spans(&mut self, spans: impl Iterator) { + // avoid allocating new vec if size is sufficient + self.spans.clear(); + self.spans.extend(spans); + } +} + pub struct Parser<'s> { src: &'s str, tree: block::Tree, - inline_spans: Vec, + inlines: InlineSpans<'s>, inline_parser: Option>>, inline_start: usize, block_attributes: Attributes<'s>, @@ -315,10 +373,10 @@ impl<'s> Parser<'s> { Self { src, tree: block::parse(src), - inline_spans: Vec::new(), + inlines: InlineSpans::new(src), inline_parser: None, inline_start: 0, - block_attributes: Attributes::none(), + block_attributes: Attributes::new(), } } } @@ -333,7 +391,10 @@ impl<'s> Parser<'s> { inline::Container::InlineMath => Container::Math { display: false }, inline::Container::DisplayMath => Container::Math { display: true }, inline::Container::RawFormat => Container::RawInline { - format: self.inline_str_cont(inline.span), + format: match self.inlines.src(inline.span) { + CowStr::Owned(_) => panic!(), + CowStr::Borrowed(s) => s, + }, }, inline::Container::Subscript => Container::Subscript, inline::Container::Superscript => Container::Superscript, @@ -345,14 +406,14 @@ impl<'s> Parser<'s> { inline::Container::SingleQuoted => Container::SingleQuoted, inline::Container::DoubleQuoted => Container::DoubleQuoted, inline::Container::InlineLink => Container::Link( - match self.inline_str(inline.span) { + match self.inlines.src(inline.span) { CowStr::Owned(s) => s.replace('\n', "").into(), s @ CowStr::Borrowed(_) => s, }, LinkType::Span(SpanLinkType::Inline), ), inline::Container::InlineImage => Container::Image( - match self.inline_str(inline.span) { + match self.inlines.src(inline.span) { CowStr::Owned(s) => s.replace('\n', "").into(), s @ CowStr::Borrowed(_) => s, }, @@ -361,7 +422,7 @@ impl<'s> Parser<'s> { _ => todo!("{:?}", c), }; if matches!(inline.kind, inline::EventKind::Enter(_)) { - Event::Start(t, Attributes::none()) + Event::Start(t, Attributes::new()) } else { Event::End(t) } @@ -375,38 +436,10 @@ impl<'s> Parser<'s> { inline::Atom::Hardbreak => Event::Atom(Atom::Hardbreak), inline::Atom::Escape => Event::Atom(Atom::Escape), }, - inline::EventKind::Str => Event::Str(self.inline_str(inline.span)), + inline::EventKind::Str => Event::Str(self.inlines.src(inline.span)), inline::EventKind::Attributes => todo!(), } } - - fn inline_str_cont(&self, span: Span) -> &'s str { - span.translate(self.inline_spans[0].start()).of(self.src) - } - - /// Copy string if discontinuous. - fn inline_str(&self, span: Span) -> CowStr<'s> { - let mut a = 0; - let mut s = String::new(); - for sp in &self.inline_spans { - let b = a + sp.len(); - if span.start() < b { - let r = if a <= span.start() { - if span.end() <= b { - // continuous - return CowStr::Borrowed(self.inline_str_cont(span)); - } - (span.start() - a)..sp.len() - } else { - 0..sp.len().min(span.end() - a) - }; - s.push_str(&sp.of(self.src)[r]); - } - a = b; - } - assert_eq!(span.len(), s.len()); - CowStr::Owned(s) - } } impl<'s> Iterator for Parser<'s> { @@ -415,6 +448,7 @@ impl<'s> Iterator for Parser<'s> { fn next(&mut self) -> Option { if let Some(parser) = &mut self.inline_parser { if let Some(inline) = parser.next() { + // SAFETY: cannot set lifetime 's on self due to trait return Some(self.inline(inline)); } self.inline_parser = None; @@ -427,17 +461,15 @@ impl<'s> Iterator for Parser<'s> { block::Atom::Blankline => Event::Atom(Atom::Blankline), block::Atom::ThematicBreak => Event::Atom(Atom::ThematicBreak), block::Atom::Attributes => { - self.block_attributes.parse(content); + self.block_attributes.parse(&content); + dbg!(&self.block_attributes); continue; } }, tree::EventKind::Enter(c) => match c { block::Node::Leaf(l) => { - self.inline_spans = self.tree.inlines().collect(); - let chars = InlineChars::new(self.src, unsafe { - std::mem::transmute(self.inline_spans.as_slice()) - }); - self.inline_parser = Some(inline::Parser::new(chars)); + self.inlines.set_spans(self.tree.inlines()); + self.inline_parser = Some(inline::Parser::new(self.inlines.chars())); self.inline_start = ev.span.end(); let container = match l { block::Leaf::CodeBlock { .. } => { @@ -538,15 +570,15 @@ mod test { fn heading() { test_parse!( "#\n", - Start(Heading { level: 1 }, Attributes::none()), + Start(Heading { level: 1 }, Attributes::new()), End(Heading { level: 1 }), ); test_parse!( "# abc\ndef\n", - Start(Heading { level: 1 }, Attributes::none()), - Str(CowStr::Borrowed("abc")), + Start(Heading { level: 1 }, Attributes::new()), + Str("abc".into()), Atom(Softbreak), - Str(CowStr::Borrowed("def")), + Str("def".into()), End(Heading { level: 1 }), ); } @@ -555,7 +587,7 @@ mod test { fn blockquote() { test_parse!( ">\n", - Start(Blockquote, Attributes::none()), + Start(Blockquote, Attributes::new()), Atom(Blankline), End(Blockquote), ); @@ -565,24 +597,24 @@ mod test { fn para() { test_parse!( "para", - Start(Paragraph, Attributes::none()), - Str(CowStr::Borrowed("para")), + Start(Paragraph, Attributes::new()), + Str("para".into()), End(Paragraph), ); test_parse!( "pa ra", - Start(Paragraph, Attributes::none()), - Str(CowStr::Borrowed("pa ra")), + Start(Paragraph, Attributes::new()), + Str("pa ra".into()), End(Paragraph), ); test_parse!( "para0\n\npara1", - Start(Paragraph, Attributes::none()), - Str(CowStr::Borrowed("para0")), + Start(Paragraph, Attributes::new()), + Str("para0".into()), End(Paragraph), Atom(Blankline), - Start(Paragraph, Attributes::none()), - Str(CowStr::Borrowed("para1")), + Start(Paragraph, Attributes::new()), + Str("para1".into()), End(Paragraph), ); } @@ -591,9 +623,9 @@ mod test { fn verbatim() { test_parse!( "`abc\ndef", - Start(Paragraph, Attributes::none()), - Start(Verbatim, Attributes::none()), - Str(CowStr::Borrowed("abc\ndef")), + Start(Paragraph, Attributes::new()), + Start(Verbatim, Attributes::new()), + Str("abc\ndef".into()), End(Verbatim), End(Paragraph), ); @@ -602,10 +634,10 @@ mod test { "> `abc\n", "> def\n", // ), - Start(Blockquote, Attributes::none()), - Start(Paragraph, Attributes::none()), - Start(Verbatim, Attributes::none()), - Str(CowStr::Owned("abc\ndef".to_string())), + Start(Blockquote, Attributes::new()), + Start(Paragraph, Attributes::new()), + Start(Verbatim, Attributes::new()), + Str("abc\ndef".into()), End(Verbatim), End(Paragraph), End(Blockquote), @@ -616,9 +648,9 @@ mod test { fn raw_inline() { test_parse!( "``raw\nraw``{=format}", - Start(Paragraph, Attributes::none()), - Start(RawInline { format: "format" }, Attributes::none()), - Str(CowStr::Borrowed("raw\nraw")), + Start(Paragraph, Attributes::new()), + Start(RawInline { format: "format" }, Attributes::new()), + Str("raw\nraw".into()), End(RawInline { format: "format" }), End(Paragraph), ); @@ -628,19 +660,13 @@ mod test { fn link_inline() { test_parse!( "[text](url)", - Start(Paragraph, Attributes::none()), + Start(Paragraph, Attributes::new()), Start( - Link( - CowStr::Borrowed("url"), - LinkType::Span(SpanLinkType::Inline), - ), - Attributes::none() + Link("url".into(), LinkType::Span(SpanLinkType::Inline)), + Attributes::new() ), - Str(CowStr::Borrowed("text")), - End(Link( - CowStr::Borrowed("url"), - LinkType::Span(SpanLinkType::Inline) - )), + Str("text".into()), + End(Link("url".into(), LinkType::Span(SpanLinkType::Inline))), End(Paragraph), ); test_parse!( @@ -648,22 +674,26 @@ mod test { "> [text](url\n", "> url)\n", // ), - Start(Blockquote, Attributes::none()), - Start(Paragraph, Attributes::none()), + Start(Blockquote, Attributes::new()), + Start(Paragraph, Attributes::new()), Start( - Link( - CowStr::Owned("urlurl".to_string()), - LinkType::Span(SpanLinkType::Inline) - ), - Attributes::none() + Link("urlurl".into(), LinkType::Span(SpanLinkType::Inline)), + Attributes::new() ), - Str(CowStr::Borrowed("text")), - End(Link( - CowStr::Borrowed("urlurl"), - LinkType::Span(SpanLinkType::Inline) - )), + Str("text".into()), + End(Link("urlurl".into(), LinkType::Span(SpanLinkType::Inline))), End(Paragraph), End(Blockquote), ); } + + #[test] + fn attr_block() { + test_parse!( + "{.some_class}\npara\n", + Start(Paragraph, [("class", "some_class")].into_iter().collect()), + Str("para".into()), + End(Paragraph), + ); + } }