jotdown/src/lib.rs

457 lines
14 KiB
Rust
Raw Normal View History

2022-11-29 12:34:13 -05:00
pub mod html;
2022-11-12 12:45:17 -05:00
mod block;
2022-11-16 16:11:55 -05:00
mod inline;
2022-11-20 13:13:48 -05:00
mod lex;
2022-11-12 12:45:17 -05:00
mod span;
mod tree;
2022-11-28 14:12:49 -05:00
use span::Span;
2022-11-20 13:13:48 -05:00
/// A unit marker type.
// NOTE(review): nothing in this file refers to `Block`; presumably a placeholder for a future
// public block type -- confirm before relying on it.
pub struct Block;

/// The NUL character.
// NOTE(review): the name suggests an end-of-input sentinel used by the submodules; it is not
// referenced in this file -- confirm.
const EOF: char = '\0';
2022-11-28 14:12:49 -05:00
#[derive(Debug, PartialEq, Eq)]
2022-11-28 18:33:43 -05:00
pub enum Event<'s> {
2022-11-29 12:34:13 -05:00
/// Start of a container.
Start(Container<'s>, Attributes<'s>),
/// End of a container.
End(Container<'s>),
2022-11-28 15:52:09 -05:00
/// A string object, text only.
Str(&'s str),
2022-11-29 12:34:13 -05:00
/// An atomic element.
Atom(Atom),
2022-11-28 15:52:09 -05:00
/// A verbatim string.
Verbatim(&'s str),
/// An inline or display math element.
Math { content: &'s str, display: bool },
2022-11-27 15:59:54 -05:00
}
2022-11-28 14:12:49 -05:00
#[derive(Debug, PartialEq, Eq)]
2022-11-29 12:34:13 -05:00
pub enum Container<'s> {
/// A blockquote element.
Blockquote,
/// A list.
List(List),
/// An item of a list
ListItem,
/// A description list element.
DescriptionList,
/// Details describing a term within a description list.
DescriptionDetails,
/// A footnote definition.
Footnote { tag: &'s str },
/// A table element.
Table,
/// A row element of a table.
TableRow,
/// A block-level divider element.
Div,
2022-11-28 15:52:09 -05:00
/// A paragraph.
2022-11-27 15:59:54 -05:00
Paragraph,
2022-11-28 15:52:09 -05:00
/// A heading.
2022-11-27 15:59:54 -05:00
Heading { level: u8 },
2022-11-28 15:52:09 -05:00
/// A cell element of row within a table.
2022-11-27 15:59:54 -05:00
TableCell,
/// A term within a description list.
DescriptionTerm,
2022-11-28 15:52:09 -05:00
/// A block with raw markup for a specific output format.
2022-11-27 15:59:54 -05:00
RawBlock { format: &'s str },
2022-11-28 15:52:09 -05:00
/// A block with code in a specific language.
CodeBlock { lang: Option<&'s str> },
/// An inline divider element.
Span,
/// An inline link with a destination URL.
Link(&'s str, LinkType),
/// An inline image.
Image(&'s str),
2022-11-28 18:33:43 -05:00
/// A subscripted element.
Subscript,
/// A superscripted element.
Superscript,
/// An inserted inline element.
2022-11-28 18:33:43 -05:00
Insert,
/// A deleted inline element.
2022-11-28 18:33:43 -05:00
Delete,
/// An inline element emphasized with a bold typeface.
2022-11-28 18:33:43 -05:00
Strong,
/// An emphasized inline element.
2022-11-28 18:33:43 -05:00
Emphasis,
/// A highlighted inline element.
Mark,
/// An quoted inline element, using single quotes.
2022-11-28 18:33:43 -05:00
SingleQuoted,
/// A quoted inline element, using double quotes.
DoubleQuoted,
2022-11-27 15:59:54 -05:00
}
impl<'s> Container<'s> {
    /// Returns `true` for block-level containers and `false` for inline ones.
    fn is_block(&self) -> bool {
        match self {
            // Inline containers.
            Self::Span
            | Self::Link(..)
            | Self::Image(..)
            | Self::Subscript
            | Self::Superscript
            | Self::Insert
            | Self::Delete
            | Self::Strong
            | Self::Emphasis
            | Self::Mark
            | Self::SingleQuoted
            | Self::DoubleQuoted => false,

            // Blocks that do not hold child blocks.
            Self::Paragraph
            | Self::Heading { .. }
            | Self::DescriptionTerm
            | Self::TableCell
            | Self::RawBlock { .. }
            | Self::CodeBlock { .. } => true,

            // Blocks that may hold child blocks.
            Self::Blockquote
            | Self::List(..)
            | Self::ListItem
            | Self::DescriptionList
            | Self::DescriptionDetails
            | Self::Footnote { .. }
            | Self::Table
            | Self::TableRow
            | Self::Div => true,
        }
    }

    /// Returns `true` only for block-level containers whose children may themselves be blocks.
    fn is_block_container(&self) -> bool {
        match self {
            // Leaf blocks and every inline container: no child blocks.
            Self::Paragraph
            | Self::Heading { .. }
            | Self::TableCell
            | Self::DescriptionTerm
            | Self::RawBlock { .. }
            | Self::CodeBlock { .. }
            | Self::Span
            | Self::Link(..)
            | Self::Image(..)
            | Self::Subscript
            | Self::Superscript
            | Self::Insert
            | Self::Delete
            | Self::Strong
            | Self::Emphasis
            | Self::Mark
            | Self::SingleQuoted
            | Self::DoubleQuoted => false,

            Self::Blockquote
            | Self::List(..)
            | Self::ListItem
            | Self::DescriptionList
            | Self::DescriptionDetails
            | Self::Footnote { .. }
            | Self::Table
            | Self::TableRow
            | Self::Div => true,
        }
    }
}
2022-11-28 14:12:49 -05:00
/// The kind of a [`Container::Link`].
// NOTE(review): the variants are undocumented in the original; their names presumably mirror the
// link syntaxes (inline, reference, `<url>` autolink, `<email>` autolink) -- confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LinkType {
    Inline,
    Reference,
    Autolink,
    Email,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum List {
Unordered,
2022-12-02 02:16:47 -05:00
Ordered { kind: OrderedListKind, start: u32 },
2022-11-28 15:52:09 -05:00
Description,
Task(bool),
}
/// The numbering scheme of an ordered list, carried by `List::Ordered` as its `kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OrderedListKind {
/// Decimal numbering, e.g. `1)`.
Decimal,
/// Lowercase alphabetic numbering, e.g. `a)`.
AlphaLower,
/// Uppercase alphabetic numbering, e.g. `A)`.
AlphaUpper,
/// Lowercase roman numbering, e.g. `iv)`.
RomanLower,
/// Uppercase roman numbering, e.g. `IV)`.
RomanUpper,
}
/// The punctuation that surrounds the number of an ordered list item.
// NOTE(review): not referenced by `List::Ordered` or anything else in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OrderedListStyle {
    /// Number is followed by a period, e.g. `1.`.
    Period,
    /// Number is followed by a closing parenthesis, e.g. `1)`.
    Paren,
    /// Number is enclosed by parentheses, e.g. `(1)`.
    ParenParen,
}
2022-11-29 12:34:13 -05:00
/// An element without content or children, reported through [`Event::Atom`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Atom {
    /// A horizontal ellipsis, i.e. a set of three periods.
    Ellipsis,
    /// An en dash.
    EnDash,
    /// An em dash.
    EmDash,
    /// A thematic break, typically a horizontal rule.
    ThematicBreak,
    /// A blank line.
    Blankline,
    /// A space that may not break a line.
    NonBreakingSpace,
    /// A newline that may or may not break a line in the output format.
    Softbreak,
    /// A newline that must break a line.
    Hardbreak,
    /// An escape character, not visible in output.
    Escape,
}
2022-11-28 15:52:09 -05:00
impl<'s> Event<'s> {
    /// Converts an event produced by the inline parser into a public [`Event`].
    ///
    /// `src` is the text that `inline.span` is resolved against; the resulting slice becomes
    /// the payload of `Str`, `Verbatim` and `Math` events.
    ///
    /// # Panics
    ///
    /// Panics (via `todo!`) for inline containers, atoms and nodes that have no mapping yet.
    fn from_inline(src: &'s str, inline: inline::Event) -> Self {
        let content = inline.span.of(src);
        // Decide Start vs. End up front, so `inline.kind` is not inspected again after its
        // payload has been bound by the match below.
        let is_enter = matches!(inline.kind, inline::EventKind::Enter(_));
        match inline.kind {
            inline::EventKind::Enter(c) | inline::EventKind::Exit(c) => {
                let t = match c {
                    inline::Container::Span => Container::Span,
                    inline::Container::Subscript => Container::Subscript,
                    inline::Container::Superscript => Container::Superscript,
                    inline::Container::Insert => Container::Insert,
                    inline::Container::Delete => Container::Delete,
                    inline::Container::Emphasis => Container::Emphasis,
                    inline::Container::Strong => Container::Strong,
                    inline::Container::Mark => Container::Mark,
                    inline::Container::SingleQuoted => Container::SingleQuoted,
                    inline::Container::DoubleQuoted => Container::DoubleQuoted,
                    _ => todo!(),
                };
                if is_enter {
                    // Inline attributes are not parsed yet; every container starts without any.
                    Self::Start(t, Attributes::none())
                } else {
                    Self::End(t)
                }
            }
            inline::EventKind::Atom(a) => Event::Atom(match a {
                inline::Atom::Ellipsis => Atom::Ellipsis,
                inline::Atom::EnDash => Atom::EnDash,
                inline::Atom::EmDash => Atom::EmDash,
                inline::Atom::Nbsp => Atom::NonBreakingSpace,
                inline::Atom::Softbreak => Atom::Softbreak,
                inline::Atom::Hardbreak => Atom::Hardbreak,
                inline::Atom::Escape => Atom::Escape,
                _ => todo!(),
            }),
            inline::EventKind::Node(n) => match n {
                inline::Node::Str => Self::Str(content),
                inline::Node::Verbatim => Self::Verbatim(content),
                inline::Node::InlineMath => Self::Math {
                    content,
                    display: false,
                },
                inline::Node::DisplayMath => Self::Math {
                    content,
                    display: true,
                },
                _ => todo!(),
            },
        }
    }
}
2022-11-29 12:34:13 -05:00
impl<'s> Container<'s> {
2022-11-28 18:33:43 -05:00
fn from_block(src: &'s str, block: block::Block) -> Self {
match block {
block::Block::Leaf(l) => match l {
block::Leaf::Paragraph => Self::Paragraph,
block::Leaf::Heading { level } => Self::Heading { level },
block::Leaf::CodeBlock { .. } => Self::CodeBlock { lang: None },
2022-11-28 18:33:43 -05:00
_ => todo!(),
},
block::Block::Container(c) => match c {
block::Container::Blockquote => Self::Blockquote,
block::Container::Div { .. } => Self::Div,
block::Container::Footnote { .. } => Self::Footnote { tag: todo!() },
_ => todo!(),
},
2022-11-28 15:52:09 -05:00
}
}
2022-11-28 14:12:49 -05:00
}
2022-11-28 18:33:43 -05:00
/// The attributes attached to a container, stored as pairs of string slices borrowed from the
/// source.
///
/// Attributes are rare, so it is better to always pay 8 bytes and sometimes an extra allocation
/// instead of always paying 24 bytes; hence the boxed vector behind an `Option`.
// NOTE(review): the pairs are presumably (key, value) -- confirm once `parse` is implemented.
#[derive(Debug, PartialEq, Eq)]
pub struct Attributes<'s>(Option<Box<Vec<(&'s str, &'s str)>>>);

impl<'s> Attributes<'s> {
    /// Creates an empty attribute set; no allocation is performed.
    #[must_use]
    pub fn none() -> Self {
        Self(None)
    }

    /// Not implemented yet.
    ///
    /// # Panics
    ///
    /// Always panics (via `todo!`).
    #[must_use]
    pub fn valid(src: &str) -> bool {
        todo!()
    }

    /// Not implemented yet.
    ///
    /// # Panics
    ///
    /// Always panics (via `todo!`).
    #[must_use]
    pub fn parse(src: &'s str) -> Self {
        todo!()
    }
}
2022-11-28 14:19:22 -05:00
pub struct Parser<'s> {
2022-11-20 13:13:48 -05:00
src: &'s str,
2022-11-28 14:19:22 -05:00
tree: block::Tree,
2022-11-22 13:19:21 -05:00
parser: Option<inline::Parser<'s>>,
2022-11-26 19:12:56 -05:00
inline_start: usize,
2022-11-20 13:13:48 -05:00
}
2022-11-28 14:19:22 -05:00
impl<'s> Parser<'s> {
    /// Builds a parser for `src`.
    ///
    /// The block tree is built right away via `block::parse`; no inline parser is active and
    /// the inline offset starts at zero.
    #[must_use]
    pub fn new(src: &'s str) -> Self {
        let tree = block::parse(src);
        Self {
            src,
            tree,
            parser: None,
            inline_start: 0,
        }
    }
}
impl<'s> Iterator for Parser<'s> {
2022-11-28 18:33:43 -05:00
type Item = Event<'s>;
2022-11-20 13:13:48 -05:00
fn next(&mut self) -> Option<Self::Item> {
2022-11-22 13:19:21 -05:00
while let Some(parser) = &mut self.parser {
// inside leaf block, with inline content
2022-11-26 19:12:56 -05:00
if let Some(mut inline) = parser.next() {
2022-11-27 18:10:28 -05:00
inline.span = inline.span.translate(self.inline_start);
2022-11-28 18:33:43 -05:00
return Some(Event::from_inline(self.src, inline));
2022-11-22 13:19:21 -05:00
} else if let Some(ev) = self.tree.next() {
2022-11-28 14:12:49 -05:00
match ev.kind {
tree::EventKind::Element(atom) => {
assert_eq!(atom, block::Atom::Inline);
parser.parse(ev.span.of(self.src));
2022-11-22 13:19:21 -05:00
}
2022-11-28 18:33:43 -05:00
tree::EventKind::Exit(block) => {
2022-11-22 13:19:21 -05:00
self.parser = None;
2022-11-29 12:34:13 -05:00
return Some(Event::End(Container::from_block(self.src, block)));
2022-11-20 13:13:48 -05:00
}
2022-11-28 14:12:49 -05:00
tree::EventKind::Enter(..) => unreachable!(),
2022-11-22 13:19:21 -05:00
}
2022-11-20 13:13:48 -05:00
}
2022-11-22 13:19:21 -05:00
}
2022-11-28 14:12:49 -05:00
self.tree.next().map(|ev| match ev.kind {
tree::EventKind::Element(atom) => {
assert_eq!(atom, block::Atom::Blankline);
2022-11-29 12:34:13 -05:00
Event::Atom(Atom::Blankline)
2022-11-22 13:19:21 -05:00
}
2022-11-28 14:12:49 -05:00
tree::EventKind::Enter(block) => {
if matches!(block, block::Block::Leaf(l)) {
2022-11-26 19:12:56 -05:00
self.parser = Some(inline::Parser::new());
}
match block {
block::Block::Leaf(block::Leaf::Paragraph) => self.inline_start = ev.span.end(),
block::Block::Leaf(block::Leaf::CodeBlock { .. }) => {
let lang = self.tree.next().unwrap();
self.inline_start = lang.span.end();
let lang = (!lang.span.is_empty()).then(|| lang.span.of(self.src).trim());
return Event::Start(Container::CodeBlock { lang }, Attributes::none());
}
_ => {}
2022-11-26 19:12:56 -05:00
}
2022-11-29 12:34:13 -05:00
Event::Start(Container::from_block(self.src, block), Attributes::none())
2022-11-22 13:19:21 -05:00
}
2022-11-29 12:34:13 -05:00
tree::EventKind::Exit(block) => Event::End(Container::from_block(self.src, block)),
2022-11-20 13:13:48 -05:00
})
}
}
2022-11-22 13:19:21 -05:00
#[cfg(test)]
mod test {
    use super::Atom::*;
    use super::Attributes;
    use super::Container::*;
    use super::Event::*;

    /// Parses `$src` and asserts that the produced events equal the listed ones.
    ///
    /// On failure the panic message shows the input followed by a side-by-side, colorized
    /// (ANSI escapes) listing of actual vs. expected events, one event per row.
    macro_rules! test_parse {
        ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
            #[allow(unused)]
            let actual = super::Parser::new($src).collect::<Vec<_>>();
            let expected = &[$($($token),*,)?];
            assert_eq!(
                actual,
                expected,
                concat!(
                    "\n",
                    "\x1b[0;1m====================== INPUT =========================\x1b[0m\n",
                    "\x1b[2m{}",
                    "\x1b[0;1m================ ACTUAL vs EXPECTED ==================\x1b[0m\n",
                    "{}",
                    "\x1b[0;1m======================================================\x1b[0m\n",
                ),
                $src,
                {
                    let a = actual.iter().map(|n| format!("{:?}", n)).collect::<Vec<_>>();
                    let b = expected.iter().map(|n| format!("{:?}", n)).collect::<Vec<_>>();
                    // Pad the shorter column with empty rows so both have `max` entries.
                    let max = a.len().max(b.len());
                    let a_width = a.iter().map(|a| a.len()).max().unwrap_or(0);
                    a.iter()
                        .map(AsRef::as_ref)
                        .chain(std::iter::repeat(""))
                        .zip(b.iter().map(AsRef::as_ref).chain(std::iter::repeat("")))
                        .take(max)
                        .map(|(a, b)|
                            format!(
                                "\x1b[{}m{:a_width$}\x1b[0m {}= \x1b[{}m{}\x1b[0m\n",
                                if a == b { "2" } else { "31" },
                                a,
                                if a == b { '=' } else { '!' },
                                if a == b { "2" } else { "32" },
                                b,
                                a_width = a_width,
                            )
                        )
                        .collect::<String>()
                },
            );
        };
    }

    #[test]
    fn para() {
        test_parse!(
            "para",
            Start(Paragraph, Attributes::none()),
            Str("para"),
            End(Paragraph),
        );
        test_parse!(
            "pa ra",
            Start(Paragraph, Attributes::none()),
            Str("pa ra"),
            End(Paragraph),
        );
        test_parse!(
            "para0\n\npara1",
            Start(Paragraph, Attributes::none()),
            Str("para0"),
            End(Paragraph),
            Atom(Blankline),
            Start(Paragraph, Attributes::none()),
            Str("para1"),
            End(Paragraph),
        );
    }
}