PR #40 use bytes instead of chars

Merge branch 'ascii'
This commit is contained in:
Noah Hellman 2023-05-16 19:59:05 +02:00
commit e4569f5c3e
7 changed files with 559 additions and 629 deletions

View file

@ -8,26 +8,24 @@ pub(crate) fn parse(src: &str) -> Attributes {
a a
} }
pub fn valid<I: Iterator<Item = char>>(chars: I) -> (usize, bool) { pub fn valid(src: &str) -> usize {
use State::*; use State::*;
let mut has_attr = false;
let mut n = 0; let mut n = 0;
let mut state = Start; let mut state = Start;
for c in chars { for c in src.bytes() {
n += 1; n += 1;
state = state.step(c); state = state.step(c);
match state { match state {
Class | Identifier | Value | ValueQuoted => has_attr = true,
Done | Invalid => break, Done | Invalid => break,
_ => {} _ => {}
} }
} }
if matches!(state, Done) { if matches!(state, Done) {
(n, has_attr) n
} else { } else {
(0, false) 0
} }
} }
@ -258,11 +256,11 @@ impl Validator {
/// Returns number of valid bytes parsed (0 means invalid) if finished, otherwise more input is /// Returns number of valid bytes parsed (0 means invalid) if finished, otherwise more input is
/// needed. /// needed.
pub fn parse(&mut self, input: &str) -> Option<usize> { pub fn parse(&mut self, input: &str) -> Option<usize> {
let mut chars = input.chars(); let mut bytes = input.bytes();
for c in &mut chars { for c in &mut bytes {
self.state = self.state.step(c); self.state = self.state.step(c);
match self.state { match self.state {
State::Done => return Some(input.len() - chars.as_str().len()), State::Done => return Some(input.len() - bytes.len()),
State::Invalid => return Some(0), State::Invalid => return Some(0),
_ => {} _ => {}
} }
@ -299,7 +297,7 @@ impl<'s> Parser<'s> {
let mut pos = 0; let mut pos = 0;
let mut pos_prev = 0; let mut pos_prev = 0;
for c in input.chars() { for c in input.bytes() {
let state_next = self.state.step(c); let state_next = self.state.step(c);
let st = std::mem::replace(&mut self.state, state_next); let st = std::mem::replace(&mut self.state, state_next);
@ -320,7 +318,7 @@ impl<'s> Parser<'s> {
} }
}; };
pos += c.len_utf8(); pos += 1;
debug_assert!(!matches!(self.state, Invalid)); debug_assert!(!matches!(self.state, Invalid));
@ -360,40 +358,40 @@ enum State {
} }
impl State { impl State {
fn step(self, c: char) -> State { fn step(self, c: u8) -> State {
use State::*; use State::*;
match self { match self {
Start if c == '{' => Whitespace, Start if c == b'{' => Whitespace,
Start => Invalid, Start => Invalid,
Whitespace => match c { Whitespace => match c {
'}' => Done, b'}' => Done,
'.' => ClassFirst, b'.' => ClassFirst,
'#' => IdentifierFirst, b'#' => IdentifierFirst,
'%' => Comment, b'%' => Comment,
c if is_name(c) => Key, c if is_name(c) => Key,
c if c.is_whitespace() => Whitespace, c if c.is_ascii_whitespace() => Whitespace,
_ => Invalid, _ => Invalid,
}, },
Comment if c == '%' => Whitespace, Comment if c == b'%' => Whitespace,
Comment => Comment, Comment => Comment,
ClassFirst if is_name(c) => Class, ClassFirst if is_name(c) => Class,
ClassFirst => Invalid, ClassFirst => Invalid,
IdentifierFirst if is_name(c) => Identifier, IdentifierFirst if is_name(c) => Identifier,
IdentifierFirst => Invalid, IdentifierFirst => Invalid,
s @ (Class | Identifier | Value) if is_name(c) => s, s @ (Class | Identifier | Value) if is_name(c) => s,
Class | Identifier | Value if c.is_whitespace() => Whitespace, Class | Identifier | Value if c.is_ascii_whitespace() => Whitespace,
Class | Identifier | Value if c == '}' => Done, Class | Identifier | Value if c == b'}' => Done,
Class | Identifier | Value => Invalid, Class | Identifier | Value => Invalid,
Key if is_name(c) => Key, Key if is_name(c) => Key,
Key if c == '=' => ValueFirst, Key if c == b'=' => ValueFirst,
Key => Invalid, Key => Invalid,
ValueFirst if is_name(c) => Value, ValueFirst if is_name(c) => Value,
ValueFirst if c == '"' => ValueQuoted, ValueFirst if c == b'"' => ValueQuoted,
ValueFirst => Invalid, ValueFirst => Invalid,
ValueQuoted | ValueNewline | ValueContinued if c == '"' => Whitespace, ValueQuoted | ValueNewline | ValueContinued if c == b'"' => Whitespace,
ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == '\n' => ValueNewline, ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == b'\n' => ValueNewline,
ValueQuoted if c == '\\' => ValueEscape, ValueQuoted if c == b'\\' => ValueEscape,
ValueQuoted | ValueEscape => ValueQuoted, ValueQuoted | ValueEscape => ValueQuoted,
ValueNewline | ValueContinued => ValueContinued, ValueNewline | ValueContinued => ValueContinued,
Invalid | Done => panic!("{:?}", self), Invalid | Done => panic!("{:?}", self),
@ -401,8 +399,8 @@ impl State {
} }
} }
pub fn is_name(c: char) -> bool { pub fn is_name(c: u8) -> bool {
c.is_ascii_alphanumeric() || matches!(c, ':' | '_' | '-') c.is_ascii_alphanumeric() || matches!(c, b':' | b'_' | b'-')
} }
#[cfg(test)] #[cfg(test)]
@ -437,11 +435,6 @@ mod test {
test_attr!("{#a #b}", ("id", "b")); test_attr!("{#a #b}", ("id", "b"));
} }
#[test]
fn unicode_whitespace() {
test_attr!("{.a .b}", ("class", "a b"));
}
#[test] #[test]
fn value_unquoted() { fn value_unquoted() {
test_attr!( test_attr!(
@ -519,41 +512,45 @@ mod test {
#[test] #[test]
fn valid_full() { fn valid_full() {
let src = "{.class %comment%}"; let src = "{.class %comment%}";
assert_eq!(super::valid(src.chars()), (src.len(), true)); assert_eq!(super::valid(src), src.len());
}
#[test]
fn valid_unicode() {
let src = r#"{a="б"}"#;
assert_eq!(super::valid(src), src.len());
} }
#[test] #[test]
fn valid_empty() { fn valid_empty() {
let src = "{}"; let src = "{}";
assert_eq!(super::valid(src.chars()), (src.len(), false)); assert_eq!(super::valid(src), src.len());
} }
#[test] #[test]
fn valid_whitespace() { fn valid_whitespace() {
let src = "{ \n }"; let src = "{ \n }";
assert_eq!(super::valid(src.chars()), (src.len(), false)); assert_eq!(super::valid(src), src.len());
} }
#[test] #[test]
fn valid_comment() { fn valid_comment() {
let src = "{%comment%}"; let src = "{%comment%}";
assert_eq!(super::valid(src.chars()), (src.len(), false)); assert_eq!(super::valid(src), src.len());
} }
#[test] #[test]
fn valid_trailing() { fn valid_trailing() {
let src = "{.class}"; let src = "{.class}{.ignore}";
assert_eq!( let src_valid = "{.class}";
super::valid(src.chars().chain("{.ignore}".chars())), assert_eq!(super::valid(src), src_valid.len());
(src.len(), true),
);
} }
#[test] #[test]
fn valid_invalid() { fn valid_invalid() {
assert_eq!(super::valid(" {.valid}".chars()), (0, false)); assert_eq!(super::valid(" {.valid}"), 0);
assert_eq!(super::valid("{.class invalid}".chars()), (0, false)); assert_eq!(super::valid("{.class invalid}"), 0);
assert_eq!(super::valid("abc".chars()), (0, false)); assert_eq!(super::valid("abc"), 0);
assert_eq!(super::valid("{.abc.}".chars()), (0, false)); assert_eq!(super::valid("{.abc.}"), 0);
} }
} }

View file

@ -1,7 +1,8 @@
use std::ops::Range;
use crate::Alignment; use crate::Alignment;
use crate::OrderedListNumbering::*; use crate::OrderedListNumbering::*;
use crate::OrderedListStyle::*; use crate::OrderedListStyle::*;
use crate::Span;
use crate::attr; use crate::attr;
use crate::lex; use crate::lex;
@ -11,13 +12,13 @@ use Container::*;
use Leaf::*; use Leaf::*;
use ListType::*; use ListType::*;
#[derive(Debug, Copy, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct Event<'s> { pub struct Event<'s> {
pub kind: EventKind<'s>, pub kind: EventKind<'s>,
pub span: Span, pub span: Range<usize>,
} }
#[derive(Debug, Copy, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub enum EventKind<'s> { pub enum EventKind<'s> {
Enter(Node<'s>), Enter(Node<'s>),
Inline, Inline,
@ -173,20 +174,20 @@ impl<'s> TreeParser<'s> {
} }
for _ in std::mem::take(&mut self.open_sections).drain(..) { for _ in std::mem::take(&mut self.open_sections).drain(..) {
self.exit(Span::empty_at(self.src.len())); self.exit(self.src.len()..self.src.len());
} }
debug_assert_eq!(self.open, &[]); debug_assert_eq!(self.open, &[]);
self.events self.events
} }
fn inline(&mut self, span: Span) { fn inline(&mut self, span: Range<usize>) {
self.events.push(Event { self.events.push(Event {
kind: EventKind::Inline, kind: EventKind::Inline,
span, span,
}); });
} }
fn enter(&mut self, node: Node<'s>, span: Span) -> usize { fn enter(&mut self, node: Node<'s>, span: Range<usize>) -> usize {
let i = self.events.len(); let i = self.events.len();
self.open.push(i); self.open.push(i);
self.events.push(Event { self.events.push(Event {
@ -196,7 +197,7 @@ impl<'s> TreeParser<'s> {
i i
} }
fn exit(&mut self, span: Span) -> usize { fn exit(&mut self, span: Range<usize>) -> usize {
let i = self.events.len(); let i = self.events.len();
let node = if let EventKind::Enter(node) = self.events[self.open.pop().unwrap()].kind { let node = if let EventKind::Enter(node) = self.events[self.open.pop().unwrap()].kind {
node node
@ -211,29 +212,29 @@ impl<'s> TreeParser<'s> {
} }
/// Recursively parse a block and all of its children. Return number of lines the block uses. /// Recursively parse a block and all of its children. Return number of lines the block uses.
fn parse_block(&mut self, lines: &mut [Span], top_level: bool) -> usize { fn parse_block(&mut self, lines: &mut [Range<usize>], top_level: bool) -> usize {
if let Some(MeteredBlock { if let Some(MeteredBlock {
kind, kind,
span: span_start, span: span_start,
line_count, line_count,
}) = MeteredBlock::new(lines.iter().map(|sp| sp.of(self.src))) }) = MeteredBlock::new(lines.iter().map(|sp| &self.src[sp.clone()]))
{ {
let lines = &mut lines[..line_count]; let lines = &mut lines[..line_count];
let span_start = span_start.translate(lines[0].start()); let span_start = (span_start.start + lines[0].start)..(span_start.end + lines[0].start);
let end_line = lines[lines.len() - 1]; let end_line = lines[lines.len() - 1].clone();
let span_end = match kind { let span_end = match kind {
Kind::Fenced { Kind::Fenced {
has_closing_fence: true, has_closing_fence: true,
.. ..
} => end_line, } => end_line,
_ => end_line.empty_after(), _ => end_line.end..end_line.end,
}; };
// part of first inline that is from the outer block // part of first inline that is from the outer block
let outer = Span::new(lines[0].start(), span_start.end()); let outer = lines[0].start..span_start.end;
// skip outer block part for inner content // skip outer block part for inner content
lines[0] = lines[0].skip(outer.len()); lines[0].start += outer.len();
// skip opening and closing fence of code block / div // skip opening and closing fence of code block / div
let lines = if let Kind::Fenced { let lines = if let Kind::Fenced {
@ -253,7 +254,7 @@ impl<'s> TreeParser<'s> {
&& !matches!(kind, Kind::ListItem { ty: ty_new, .. } if *ty == ty_new) && !matches!(kind, Kind::ListItem { ty: ty_new, .. } if *ty == ty_new)
{ {
let l = self.open_lists.pop().unwrap(); let l = self.open_lists.pop().unwrap();
self.close_list(l, span_start.start()); self.close_list(l, span_start.start);
} }
} }
@ -287,7 +288,7 @@ impl<'s> TreeParser<'s> {
Kind::Heading { level } => Block::Leaf(Heading { Kind::Heading { level } => Block::Leaf(Heading {
level: level.try_into().unwrap(), level: level.try_into().unwrap(),
has_section: top_level, has_section: top_level,
pos: span_start.start() as u32, pos: span_start.start as u32,
}), }),
Kind::Fenced { Kind::Fenced {
kind: FenceKind::CodeBlock(..), kind: FenceKind::CodeBlock(..),
@ -312,7 +313,7 @@ impl<'s> TreeParser<'s> {
Kind::Blockquote => Block::Container(Blockquote), Kind::Blockquote => Block::Container(Blockquote),
Kind::ListItem { ty, .. } => Block::Container(ListItem(match ty { Kind::ListItem { ty, .. } => Block::Container(ListItem(match ty {
ListType::Task => ListItemKind::Task { ListType::Task => ListItemKind::Task {
checked: span_start.of(self.src).as_bytes()[3] != b' ', checked: self.src.as_bytes()[span_start.start + 3] != b' ',
}, },
ListType::Description => ListItemKind::Description, ListType::Description => ListItemKind::Description,
_ => ListItemKind::List, _ => ListItemKind::List,
@ -348,23 +349,22 @@ impl<'s> TreeParser<'s> {
&mut self, &mut self,
leaf: Leaf<'s>, leaf: Leaf<'s>,
k: &Kind, k: &Kind,
span_start: Span, span_start: Range<usize>,
span_end: Span, span_end: Range<usize>,
mut lines: &mut [Span], mut lines: &mut [Range<usize>],
) { ) {
if let Kind::Fenced { indent, .. } = k { if let Kind::Fenced { indent, .. } = k {
for line in lines.iter_mut() { for line in lines.iter_mut() {
let indent_line = line let indent_line = self.src.as_bytes()[line.clone()]
.of(self.src) .iter()
.chars() .take_while(|c| *c != &b'\n' && c.is_ascii_whitespace())
.take_while(|c| *c != '\n' && c.is_whitespace())
.count(); .count();
*line = line.skip_chars((*indent).min(indent_line), self.src); line.start += (*indent).min(indent_line);
} }
} else { } else {
// trim starting whitespace of each inline // trim starting whitespace of each inline
for line in lines.iter_mut() { for line in lines.iter_mut() {
*line = line.trim_start(self.src); *line = self.trim_start(line.clone());
} }
// skip first inline if empty // skip first inline if empty
@ -375,15 +375,14 @@ impl<'s> TreeParser<'s> {
if matches!(leaf, LinkDefinition { .. }) { if matches!(leaf, LinkDefinition { .. }) {
// trim ending whitespace of each inline // trim ending whitespace of each inline
for line in lines.iter_mut() { for line in lines.iter_mut() {
*line = line.trim_end(self.src); *line = self.trim_end(line.clone());
} }
} }
// trim ending whitespace of block // trim ending whitespace of block
let l = lines.len(); let l = lines.len();
if l > 0 { if l > 0 {
let last = &mut lines[l - 1]; lines[l - 1] = self.trim_end(lines[l - 1].clone());
*last = last.trim_end(self.src);
} }
} }
@ -398,7 +397,7 @@ impl<'s> TreeParser<'s> {
.iter() .iter()
.rposition(|l| l < level) .rposition(|l| l < level)
.map_or(0, |i| i + 1); .map_or(0, |i| i + 1);
let pos = span_start.start() as u32; let pos = span_start.start as u32;
for i in 0..(self.open_sections.len() - first_close) { for i in 0..(self.open_sections.len() - first_close) {
let node = if let EventKind::Enter(node) = let node = if let EventKind::Enter(node) =
self.events[self.open.pop().unwrap()].kind self.events[self.open.pop().unwrap()].kind
@ -409,23 +408,31 @@ impl<'s> TreeParser<'s> {
}; };
let end = self let end = self
.attr_start .attr_start
.map_or(span_start.start(), |a| self.events[a].span.start()); .map_or(span_start.start, |a| self.events[a].span.start);
self.events.insert( self.events.insert(
self.attr_start.map_or(self.events.len(), |a| a + i), self.attr_start.map_or(self.events.len(), |a| a + i),
Event { Event {
kind: EventKind::Exit(node), kind: EventKind::Exit(node),
span: Span::new(end, end), span: end..end,
}, },
); );
} }
self.open_sections.drain(first_close..); self.open_sections.drain(first_close..);
self.open_sections.push(*level); self.open_sections.push(*level);
self.enter(Node::Container(Section { pos }), span_start.empty_before()); self.enter(
Node::Container(Section { pos }),
span_start.start..span_start.start,
);
} }
// trim '#' characters // trim '#' characters
for line in lines.iter_mut().skip(1) { for line in lines.iter_mut().skip(1) {
*line = line.trim_start_matches(self.src, |c| c == '#' || c.is_whitespace()); let start = line.start
+ self.src.as_bytes()[line.clone()]
.iter()
.take_while(|c| **c == b'#' || c.is_ascii_whitespace())
.count();
line.start = start;
} }
} }
@ -433,7 +440,7 @@ impl<'s> TreeParser<'s> {
lines lines
.iter() .iter()
.filter(|l| !matches!(k, Kind::Heading { .. }) || !l.is_empty()) .filter(|l| !matches!(k, Kind::Heading { .. }) || !l.is_empty())
.for_each(|line| self.inline(*line)); .for_each(|line| self.inline(line.clone()));
self.exit(span_end); self.exit(span_end);
} }
@ -441,36 +448,37 @@ impl<'s> TreeParser<'s> {
&mut self, &mut self,
c: Container<'s>, c: Container<'s>,
k: &Kind, k: &Kind,
mut span_start: Span, mut span_start: Range<usize>,
span_end: Span, span_end: Range<usize>,
outer: Span, outer: Range<usize>,
lines: &mut [Span], lines: &mut [Range<usize>],
) { ) {
// update spans, remove indentation / container prefix // update spans, remove indentation / container prefix
lines.iter_mut().skip(1).for_each(|sp| { lines.iter_mut().skip(1).for_each(|sp| {
let src = sp.of(self.src); let src = &self.src[sp.clone()];
let src_t = src.trim(); let src_t = src.trim_matches(|c: char| c.is_ascii_whitespace());
let spaces = src.chars().take_while(|c| c.is_whitespace()).count(); let whitespace = src_t.as_ptr() as usize - src.as_ptr() as usize;
let skip = match k { let skip = match k {
Kind::Blockquote => { Kind::Blockquote => {
if src_t == ">" { if src_t == ">" {
spaces + 1 whitespace + 1
} else if src_t.starts_with('>') } else if src_t.starts_with('>')
&& src_t.chars().nth(1).map_or(false, char::is_whitespace) && src_t[1..].starts_with(|c: char| c.is_ascii_whitespace())
{ {
spaces + 1 + usize::from(src_t.len() > 1) whitespace + 1 + usize::from(src_t.len() > 1)
} else { } else {
0 0
} }
} }
Kind::ListItem { .. } | Kind::Definition { .. } => { Kind::ListItem { .. } | Kind::Definition { .. } => whitespace.min(outer.len()),
spaces.min(outer.of(self.src).chars().count()) Kind::Fenced { indent, .. } => whitespace.min(*indent),
}
Kind::Fenced { indent, .. } => spaces.min(*indent),
_ => panic!("non-container {:?}", k), _ => panic!("non-container {:?}", k),
}; };
let count = sp.of(self.src).chars().take_while(|c| *c != '\n').count(); let len = self.src.as_bytes()[sp.clone()]
*sp = sp.skip_chars(skip.min(count), self.src); .iter()
.take_while(|c| **c != b'\n')
.count();
sp.start += skip.min(len);
}); });
if let Kind::ListItem { ty, .. } = k { if let Kind::ListItem { ty, .. } = k {
@ -485,9 +493,9 @@ impl<'s> TreeParser<'s> {
let event = self.enter( let event = self.enter(
Node::Container(Container::List { Node::Container(Container::List {
kind: ListKind { ty: *ty, tight }, kind: ListKind { ty: *ty, tight },
marker: span_start.of(self.src), marker: &self.src[span_start.clone()],
}), }),
span_start.empty_before(), span_start.start..span_start.start,
); );
self.open_lists.push(OpenList { self.open_lists.push(OpenList {
ty: *ty, ty: *ty,
@ -498,9 +506,10 @@ impl<'s> TreeParser<'s> {
} }
let dt = if let ListItem(ListItemKind::Description) = c { let dt = if let ListItem(ListItemKind::Description) = c {
let dt = self.enter(Node::Leaf(DescriptionTerm), span_start); let dt = self.enter(Node::Leaf(DescriptionTerm), span_start.clone());
self.exit(span_start.trim_end(self.src).empty_after()); let start = self.trim_end(span_start.clone()).end;
span_start = lines[0].empty_before(); self.exit(start..start);
span_start = lines[0].start..lines[0].start;
Some((dt, self.events.len(), self.open.len())) Some((dt, self.events.len(), self.open.len()))
} else { } else {
None None
@ -537,7 +546,7 @@ impl<'s> TreeParser<'s> {
self.events[empty_term + 1].kind = EventKind::Stale; self.events[empty_term + 1].kind = EventKind::Stale;
// move out term before detail // move out term before detail
self.events[enter_term].span = self.events[empty_term].span; self.events[enter_term].span = self.events[empty_term].span.clone();
let first_detail = self.events[exit_term + 1..] let first_detail = self.events[exit_term + 1..]
.iter() .iter()
.position(|e| !matches!(e.kind, EventKind::Atom(Blankline))) .position(|e| !matches!(e.kind, EventKind::Atom(Blankline)))
@ -546,13 +555,14 @@ impl<'s> TreeParser<'s> {
let detail_pos = self let detail_pos = self
.events .events
.get(first_detail) .get(first_detail)
.map(|e| e.span.start()) .map(|e| e.span.start)
.unwrap_or_else(|| self.events.last().unwrap().span.end()); .unwrap_or_else(|| self.events.last().unwrap().span.end);
self.events for (i, j) in (enter_term..first_detail).enumerate() {
.copy_within(enter_term..first_detail, enter_detail); self.events[enter_detail + i] = self.events[j].clone();
}
self.events[first_detail - 1] = Event { self.events[first_detail - 1] = Event {
kind: EventKind::Enter(Node::Container(c)), kind: EventKind::Enter(Node::Container(c)),
span: Span::empty_at(detail_pos), span: detail_pos..detail_pos,
}; };
self.open[open_detail] = first_detail - 1; self.open[open_detail] = first_detail - 1;
} }
@ -565,44 +575,54 @@ impl<'s> TreeParser<'s> {
self.prev_blankline = false; self.prev_blankline = false;
self.prev_loose = false; self.prev_loose = false;
let l = self.open_lists.pop().unwrap(); let l = self.open_lists.pop().unwrap();
self.close_list(l, span_end.start()); self.close_list(l, span_end.start);
} }
} }
self.exit(span_end); self.exit(span_end);
} }
fn parse_table(&mut self, lines: &mut [Span], span_start: Span, span_end: Span) { fn parse_table(
&mut self,
lines: &mut [Range<usize>],
span_start: Range<usize>,
span_end: Range<usize>,
) {
self.alignments.clear(); self.alignments.clear();
self.enter(Node::Container(Table), span_start); self.enter(Node::Container(Table), span_start.clone());
let caption_line = lines let caption_line = lines
.iter() .iter()
.position(|sp| sp.of(self.src).trim_start().starts_with('^')) .position(|sp| {
self.src[sp.clone()]
.trim_start_matches(|c: char| c.is_ascii_whitespace())
.starts_with('^')
})
.map_or(lines.len(), |caption_line| { .map_or(lines.len(), |caption_line| {
self.enter(Node::Leaf(Caption), span_start); self.enter(Node::Leaf(Caption), span_start.clone());
lines[caption_line] = lines[caption_line] lines[caption_line] = self.trim_start(lines[caption_line].clone());
.trim_start(self.src) lines[caption_line].start += 2;
.skip_chars(2, self.src); lines[lines.len() - 1] = self.trim_end(lines[lines.len() - 1].clone());
lines[lines.len() - 1] = lines[lines.len() - 1].trim_end(self.src);
for line in &lines[caption_line..] { for line in &lines[caption_line..] {
self.inline(*line); self.inline(line.clone());
} }
self.exit(span_end); self.exit(span_end.clone());
caption_line caption_line
}); });
let mut last_row_event = None; let mut last_row_event = None;
for row in &lines[..caption_line] { for row in &lines[..caption_line] {
let row = row.trim(self.src); let row = self.trim(row.clone());
if row.is_empty() { if row.is_empty() {
break; break;
} }
let row_event_enter = let row_event_enter = self.enter(
self.enter(Node::Container(TableRow { head: false }), row.with_len(1)); Node::Container(TableRow { head: false }),
let rem = row.skip(1); // | row.start..(row.start + 1),
let lex = lex::Lexer::new(rem.of(self.src)); );
let mut pos = rem.start(); let rem = (row.start + 1)..row.end; // |
let lex = lex::Lexer::new(&self.src.as_bytes()[rem.clone()]);
let mut pos = rem.start;
let mut cell_start = pos; let mut cell_start = pos;
let mut separator_row = true; let mut separator_row = true;
let mut verbatim = None; let mut verbatim = None;
@ -615,8 +635,8 @@ impl<'s> TreeParser<'s> {
} else { } else {
match kind { match kind {
lex::Kind::Sym(lex::Symbol::Pipe) => { lex::Kind::Sym(lex::Symbol::Pipe) => {
let span = Span::new(cell_start, pos).trim(self.src); let span = self.trim(cell_start..pos);
let cell = span.of(self.src); let cell = &self.src[span.clone()];
let separator_cell = match cell.len() { let separator_cell = match cell.len() {
0 => false, 0 => false,
1 => cell == "-", 1 => cell == "-",
@ -624,7 +644,7 @@ impl<'s> TreeParser<'s> {
l => { l => {
matches!(cell.as_bytes()[0], b'-' | b':') matches!(cell.as_bytes()[0], b'-' | b':')
&& matches!(cell.as_bytes()[l - 1], b'-' | b':') && matches!(cell.as_bytes()[l - 1], b'-' | b':')
&& cell.chars().skip(1).take(l - 2).all(|c| c == '-') && cell.bytes().skip(1).take(l - 2).all(|c| c == b'-')
} }
}; };
separator_row &= separator_cell; separator_row &= separator_cell;
@ -635,10 +655,10 @@ impl<'s> TreeParser<'s> {
.copied() .copied()
.unwrap_or(Alignment::Unspecified), .unwrap_or(Alignment::Unspecified),
)), )),
Span::empty_at(cell_start), cell_start..cell_start,
); );
self.inline(span); self.inline(span);
self.exit(Span::new(pos, pos + 1)); self.exit(pos..(pos + 1));
cell_start = pos + len; cell_start = pos + len;
column_index += 1; column_index += 1;
} }
@ -658,7 +678,7 @@ impl<'s> TreeParser<'s> {
.iter() .iter()
.filter(|e| matches!(e.kind, EventKind::Inline)) .filter(|e| matches!(e.kind, EventKind::Inline))
.map(|e| { .map(|e| {
let cell = e.span.of(self.src); let cell = &self.src[e.span.clone()];
let l = cell.as_bytes()[0] == b':'; let l = cell.as_bytes()[0] == b':';
let r = cell.as_bytes()[cell.len() - 1] == b':'; let r = cell.as_bytes()[cell.len() - 1] == b':';
match (l, r) { match (l, r) {
@ -709,7 +729,7 @@ impl<'s> TreeParser<'s> {
} }
} }
} else { } else {
let row_event_exit = self.exit(Span::empty_at(pos)); // table row let row_event_exit = self.exit(pos..pos); // table row
last_row_event = Some((row_event_enter, row_event_exit)); last_row_event = Some((row_event_enter, row_event_exit));
} }
} }
@ -729,14 +749,30 @@ impl<'s> TreeParser<'s> {
} }
} }
self.exit(Span::empty_at(pos)); // list self.exit(pos..pos); // list
}
fn trim_start(&self, sp: Range<usize>) -> Range<usize> {
let s = self.src[sp].trim_start_matches(|c: char| c.is_ascii_whitespace());
(s.as_ptr() as usize - self.src.as_ptr() as usize)
..(s.as_ptr() as usize + s.len() - self.src.as_ptr() as usize)
}
fn trim_end(&self, sp: Range<usize>) -> Range<usize> {
let s = self.src[sp].trim_end_matches(|c: char| c.is_ascii_whitespace());
(s.as_ptr() as usize - self.src.as_ptr() as usize)
..(s.as_ptr() as usize + s.len() - self.src.as_ptr() as usize)
}
fn trim(&self, sp: Range<usize>) -> Range<usize> {
self.trim_end(self.trim_start(sp))
} }
} }
/// Parser for a single block. /// Parser for a single block.
struct MeteredBlock<'s> { struct MeteredBlock<'s> {
kind: Kind<'s>, kind: Kind<'s>,
span: Span, span: Range<usize>,
line_count: usize, line_count: usize,
} }
@ -794,53 +830,52 @@ enum Kind<'s> {
struct IdentifiedBlock<'s> { struct IdentifiedBlock<'s> {
kind: Kind<'s>, kind: Kind<'s>,
span: Span, span: Range<usize>,
} }
impl<'s> IdentifiedBlock<'s> { impl<'s> IdentifiedBlock<'s> {
fn new(line: &'s str) -> Self { fn new(line: &'s str) -> Self {
let mut chars = line.chars(); let l = line.len();
let indent = chars
.clone() let line = line.trim_start_matches(|c: char| c.is_ascii_whitespace() && c != '\n');
.take_while(|c| *c != '\n' && c.is_whitespace()) let indent = l - line.len();
.count(); let line_t = line.trim_end_matches(|c: char| c.is_ascii_whitespace());
(&mut chars).take(indent).last();
let indent_bytes = line.len() - chars.as_str().len();
let line = chars.as_str();
let line_t = line.trim_end();
let l = line.len(); let l = line.len();
let lt = line_t.len(); let lt = line_t.len();
let mut chars = line.chars();
let first = if let Some(c) = chars.next() { let first = if let Some(c) = chars.next() {
c c
} else { } else {
return Self { return Self {
kind: Kind::Atom(Blankline), kind: Kind::Atom(Blankline),
span: Span::empty_at(indent_bytes), span: indent..indent,
}; };
}; };
match first { match first {
'\n' => Some((Kind::Atom(Blankline), Span::by_len(indent_bytes, 1))), '\n' => Some((Kind::Atom(Blankline), indent..(indent + 1))),
'#' => chars '#' => chars
.find(|c| *c != '#') .find(|c| *c != '#')
.map_or(true, char::is_whitespace) .map_or(true, |c| c.is_ascii_whitespace())
.then(|| { .then(|| {
let level = line.chars().take_while(|c| *c == '#').count(); let level = line.bytes().take_while(|c| *c == b'#').count();
(Kind::Heading { level }, Span::by_len(indent_bytes, level)) (Kind::Heading { level }, indent..(indent + level))
}), }),
'>' => { '>' => {
if chars.next().map_or(true, char::is_whitespace) { if chars.next().map_or(true, |c| c.is_ascii_whitespace()) {
Some((Kind::Blockquote, Span::by_len(indent_bytes, 1))) Some((Kind::Blockquote, indent..(indent + 1)))
} else { } else {
None None
} }
} }
'{' => (attr::valid(line.chars()).0 == lt) '{' => {
.then(|| (Kind::Atom(Attributes), Span::by_len(indent_bytes, l))), (attr::valid(line) == lt).then(|| (Kind::Atom(Attributes), indent..(indent + l)))
}
'|' => { '|' => {
if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") { if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") {
Some((Kind::Table { caption: false }, Span::empty_at(indent_bytes))) Some((Kind::Table { caption: false }, indent..indent))
} else { } else {
None None
} }
@ -854,17 +889,17 @@ impl<'s> IdentifiedBlock<'s> {
footnote, footnote,
label: &label[usize::from(footnote)..], label: &label[usize::from(footnote)..],
}, },
Span::by_len(0, indent_bytes + 3 + l), 0..(indent + 3 + l),
) )
}), }),
'-' | '*' if Self::is_thematic_break(chars.clone()) => { '-' | '*' if Self::is_thematic_break(chars.clone()) => {
Some((Kind::Atom(ThematicBreak), Span::by_len(indent_bytes, lt))) Some((Kind::Atom(ThematicBreak), indent..(indent + lt)))
} }
b @ ('-' | '*' | '+') => chars.next().map_or(true, |c| c == ' ').then(|| { b @ ('-' | '*' | '+') => chars.next().map_or(true, |c| c == ' ').then(|| {
let task_list = chars.next() == Some('[') let task_list = chars.next() == Some('[')
&& matches!(chars.next(), Some('x' | 'X' | ' ')) && matches!(chars.next(), Some('x' | 'X' | ' '))
&& chars.next() == Some(']') && chars.next() == Some(']')
&& chars.next().map_or(true, char::is_whitespace); && chars.next().map_or(true, |c| c.is_ascii_whitespace());
if task_list { if task_list {
( (
Kind::ListItem { Kind::ListItem {
@ -872,7 +907,7 @@ impl<'s> IdentifiedBlock<'s> {
ty: Task, ty: Task,
last_blankline: false, last_blankline: false,
}, },
Span::by_len(indent_bytes, 5), indent..(indent + 5),
) )
} else { } else {
( (
@ -881,25 +916,33 @@ impl<'s> IdentifiedBlock<'s> {
ty: Unordered(b as u8), ty: Unordered(b as u8),
last_blankline: false, last_blankline: false,
}, },
Span::by_len(indent_bytes, 1), indent..(indent + 1),
) )
} }
}), }),
':' if chars.clone().next().map_or(true, char::is_whitespace) => Some(( ':' if chars
.clone()
.next()
.map_or(true, |c| c.is_ascii_whitespace()) =>
{
Some((
Kind::ListItem { Kind::ListItem {
indent, indent,
ty: Description, ty: Description,
last_blankline: false, last_blankline: false,
}, },
Span::by_len(indent_bytes, 1), indent..(indent + 1),
)), ))
}
f @ ('`' | ':' | '~') => { f @ ('`' | ':' | '~') => {
let fence_length = 1 + (&mut chars).take_while(|c| *c == f).count(); let fence_length = 1 + (&mut chars).take_while(|c| *c == f).count();
let spec = &line_t[fence_length..].trim_start(); let spec =
&line_t[fence_length..].trim_start_matches(|c: char| c.is_ascii_whitespace());
let valid_spec = if f == ':' { let valid_spec = if f == ':' {
spec.chars().all(attr::is_name) spec.bytes().all(attr::is_name)
} else { } else {
!spec.chars().any(char::is_whitespace) && !spec.chars().any(|c| c == '`') !spec.bytes().any(|c| c.is_ascii_whitespace())
&& !spec.bytes().any(|c| c == b'`')
}; };
(valid_spec && fence_length >= 3).then(|| { (valid_spec && fence_length >= 3).then(|| {
( (
@ -913,7 +956,7 @@ impl<'s> IdentifiedBlock<'s> {
spec, spec,
has_closing_fence: false, has_closing_fence: false,
}, },
Span::by_len(indent_bytes, line.len()), indent..(indent + line.len()),
) )
}) })
} }
@ -924,14 +967,14 @@ impl<'s> IdentifiedBlock<'s> {
ty: Ordered(num, style), ty: Ordered(num, style),
last_blankline: false, last_blankline: false,
}, },
Span::by_len(indent_bytes, len), indent..(indent + len),
) )
}), }),
} }
.map(|(kind, span)| Self { kind, span }) .map(|(kind, span)| Self { kind, span })
.unwrap_or(Self { .unwrap_or(Self {
kind: Kind::Paragraph, kind: Kind::Paragraph,
span: Span::empty_at(indent_bytes), span: indent..indent,
}) })
} }
@ -940,7 +983,7 @@ impl<'s> IdentifiedBlock<'s> {
for c in chars { for c in chars {
if matches!(c, '-' | '*') { if matches!(c, '-' | '*') {
n += 1; n += 1;
} else if !c.is_whitespace() { } else if !c.is_ascii_whitespace() {
return false; return false;
} }
} }
@ -1023,7 +1066,7 @@ impl<'s> IdentifiedBlock<'s> {
numbering numbering
}; };
if chars.next().map_or(true, char::is_whitespace) { if chars.next().map_or(true, |c| c.is_ascii_whitespace()) {
Some((numbering, style, len_num + len_style)) Some((numbering, style, len_num + len_style))
} else { } else {
None None
@ -1054,18 +1097,19 @@ impl<'s> Kind<'s> {
last_blankline, last_blankline,
.. ..
} => { } => {
let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); let line_t = line.trim_start_matches(|c: char| c.is_ascii_whitespace());
let whitespace = line.len() - line_t.len();
let para = !*last_blankline && matches!(next, Self::Paragraph); let para = !*last_blankline && matches!(next, Self::Paragraph);
let blankline = matches!(next, Self::Atom(Blankline)); *last_blankline = matches!(next, Self::Atom(Blankline));
*last_blankline = blankline; *last_blankline || whitespace > *indent || para
blankline || spaces > *indent || para
} }
Self::Definition { Self::Definition {
indent, footnote, .. indent, footnote, ..
} => { } => {
if *footnote { if *footnote {
let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); let line_t = line.trim_start_matches(|c: char| c.is_ascii_whitespace());
matches!(next, Self::Atom(Blankline)) || spaces > *indent let whitespace = line.len() - line_t.len();
matches!(next, Self::Atom(Blankline)) || whitespace > *indent
} else { } else {
line.starts_with(' ') && !matches!(next, Self::Atom(Blankline)) line.starts_with(' ') && !matches!(next, Self::Atom(Blankline))
} }
@ -1093,7 +1137,10 @@ impl<'s> Kind<'s> {
} }
Self::Table { caption } => { Self::Table { caption } => {
matches!(next, Self::Table { .. } | Self::Atom(Blankline)) || { matches!(next, Self::Table { .. } | Self::Atom(Blankline)) || {
if line.trim().starts_with("^ ") { if line
.trim_matches(|c: char| c.is_ascii_whitespace())
.starts_with("^ ")
{
*caption = true; *caption = true;
true true
} else { } else {
@ -1106,7 +1153,7 @@ impl<'s> Kind<'s> {
} }
/// Similar to `std::str::split('\n')` but newline is included and spans are used instead of `str`. /// Similar to `std::str::split('\n')` but newline is included and spans are used instead of `str`.
fn lines(src: &str) -> impl Iterator<Item = Span> + '_ { fn lines(src: &str) -> impl Iterator<Item = Range<usize>> + '_ {
let mut chars = src.chars(); let mut chars = src.chars();
std::iter::from_fn(move || { std::iter::from_fn(move || {
if chars.as_str().is_empty() { if chars.as_str().is_empty() {
@ -1118,7 +1165,7 @@ fn lines(src: &str) -> impl Iterator<Item = Span> + '_ {
if start == end { if start == end {
None None
} else { } else {
Some(Span::new(start, end)) Some(start..end)
} }
} }
}) })
@ -1144,7 +1191,7 @@ mod test {
macro_rules! test_parse { macro_rules! test_parse {
($src:expr $(,$($event:expr),* $(,)?)?) => { ($src:expr $(,$($event:expr),* $(,)?)?) => {
let t = super::TreeParser::new($src).parse(); let t = super::TreeParser::new($src).parse();
let actual = t.into_iter().map(|ev| (ev.kind, ev.span.of($src))).collect::<Vec<_>>(); let actual = t.into_iter().map(|ev| (ev.kind, &$src[ev.span])).collect::<Vec<_>>();
let expected = &[$($($event),*,)?]; let expected = &[$($($event),*,)?];
assert_eq!( assert_eq!(
actual, actual,
@ -2734,10 +2781,10 @@ mod test {
macro_rules! test_block { macro_rules! test_block {
($src:expr, $kind:expr, $str:expr, $len:expr $(,)?) => { ($src:expr, $kind:expr, $str:expr, $len:expr $(,)?) => {
let lines = super::lines($src).map(|sp| sp.of($src)); let lines = super::lines($src).map(|sp| &$src[sp]);
let mb = super::MeteredBlock::new(lines).unwrap(); let mb = super::MeteredBlock::new(lines).unwrap();
assert_eq!( assert_eq!(
(mb.kind, mb.span.of($src), mb.line_count), (mb.kind, &$src[mb.span], mb.line_count),
($kind, $str, $len), ($kind, $str, $len),
"\n\n{}\n\n", "\n\n{}\n\n",
$src $src

View file

@ -1,7 +1,8 @@
use std::ops::Range;
use crate::attr; use crate::attr;
use crate::lex; use crate::lex;
use crate::CowStr; use crate::CowStr;
use crate::Span;
use lex::Delimiter; use lex::Delimiter;
use lex::Sequence; use lex::Sequence;
@ -72,7 +73,7 @@ type AttributesIndex = u32;
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
pub struct Event<'s> { pub struct Event<'s> {
pub kind: EventKind<'s>, pub kind: EventKind<'s>,
pub span: Span, pub span: Range<usize>,
} }
#[derive(Clone)] #[derive(Clone)]
@ -83,26 +84,26 @@ struct Input<'s> {
/// The block is complete, the final line has been provided. /// The block is complete, the final line has been provided.
complete: bool, complete: bool,
/// Span of current line. /// Span of current line.
span_line: Span, span_line: Range<usize>,
/// Upcoming lines within the current block. /// Upcoming lines within the current block.
ahead: std::collections::VecDeque<Span>, ahead: std::collections::VecDeque<Range<usize>>,
/// Span of current event. /// Span of current event.
span: Span, span: Range<usize>,
} }
impl<'s> Input<'s> { impl<'s> Input<'s> {
fn new(src: &'s str) -> Self { fn new(src: &'s str) -> Self {
Self { Self {
src, src,
lexer: lex::Lexer::new(""), lexer: lex::Lexer::new(b""),
complete: false, complete: false,
span_line: Span::new(0, 0), span_line: 0..0,
ahead: std::collections::VecDeque::new(), ahead: std::collections::VecDeque::new(),
span: Span::empty_at(0), span: 0..0,
} }
} }
fn feed_line(&mut self, line: Span, last: bool) { fn feed_line(&mut self, line: Range<usize>, last: bool) {
debug_assert!(!self.complete); debug_assert!(!self.complete);
self.complete = last; self.complete = last;
if self.lexer.ahead().is_empty() { if self.lexer.ahead().is_empty() {
@ -117,14 +118,14 @@ impl<'s> Input<'s> {
} }
} }
fn set_current_line(&mut self, line: Span) { fn set_current_line(&mut self, line: Range<usize>) {
self.lexer = lex::Lexer::new(line.of(self.src)); self.lexer = lex::Lexer::new(&self.src.as_bytes()[line.clone()]);
self.span = line.start..line.start;
self.span_line = line; self.span_line = line;
self.span = line.empty_before();
} }
fn reset(&mut self) { fn reset(&mut self) {
self.lexer = lex::Lexer::new(""); self.lexer = lex::Lexer::new(b"");
self.complete = false; self.complete = false;
self.ahead.clear(); self.ahead.clear();
} }
@ -136,7 +137,7 @@ impl<'s> Input<'s> {
fn eat(&mut self) -> Option<lex::Token> { fn eat(&mut self) -> Option<lex::Token> {
let tok = self.lexer.next(); let tok = self.lexer.next();
if let Some(t) = &tok { if let Some(t) = &tok {
self.span = self.span.extend(t.len); self.span.end += t.len;
} }
tok tok
} }
@ -146,29 +147,30 @@ impl<'s> Input<'s> {
} }
fn reset_span(&mut self) { fn reset_span(&mut self) {
self.span = self.span.empty_after(); self.span.start = self.span.end;
} }
fn ahead_raw_format(&mut self) -> Option<Span> { fn ahead_raw_format(&mut self) -> Option<Range<usize>> {
if matches!( if matches!(
self.lexer.peek().map(|t| &t.kind), self.lexer.peek().map(|t| &t.kind),
Some(lex::Kind::Open(Delimiter::BraceEqual)) Some(lex::Kind::Open(Delimiter::BraceEqual))
) { ) {
let mut ahead = self.lexer.ahead().chars();
let mut end = false; let mut end = false;
let len = (&mut ahead) let len = self
.lexer
.ahead()
.iter()
.skip(2) // {= .skip(2) // {=
.take_while(|c| { .take_while(|c| {
if *c == '{' { if **c == b'{' {
return false; return false;
} }
if *c == '}' { if **c == b'}' {
end = true; end = true;
}; };
!end && !c.is_whitespace() !end && !c.is_ascii_whitespace()
}) })
.map(char::len_utf8) .count();
.sum();
(len > 0 && end).then(|| { (len > 0 && end).then(|| {
let tok = self.eat(); let tok = self.eat();
debug_assert_eq!( debug_assert_eq!(
@ -178,8 +180,8 @@ impl<'s> Input<'s> {
len: 2, len: 2,
}) })
); );
self.lexer = lex::Lexer::new(ahead.as_str()); self.lexer.skip_ahead(len + 1);
self.span.after(len) self.span.end..(self.span.end + len)
}) })
} else { } else {
None None
@ -252,7 +254,7 @@ impl<'s> Parser<'s> {
} }
} }
pub fn feed_line(&mut self, line: Span, last: bool) { pub fn feed_line(&mut self, line: Range<usize>, last: bool) {
self.input.feed_line(line, last); self.input.feed_line(line, last);
} }
@ -266,13 +268,13 @@ impl<'s> Parser<'s> {
self.store_attributes.clear(); self.store_attributes.clear();
} }
fn push_sp(&mut self, kind: EventKind<'s>, span: Span) -> Option<ControlFlow> { fn push_sp(&mut self, kind: EventKind<'s>, span: Range<usize>) -> Option<ControlFlow> {
self.events.push_back(Event { kind, span }); self.events.push_back(Event { kind, span });
Some(Continue) Some(Continue)
} }
fn push(&mut self, kind: EventKind<'s>) -> Option<ControlFlow> { fn push(&mut self, kind: EventKind<'s>) -> Option<ControlFlow> {
self.push_sp(kind, self.input.span) self.push_sp(kind, self.input.span.clone())
} }
fn parse_event(&mut self) -> ControlFlow { fn parse_event(&mut self) -> ControlFlow {
@ -308,11 +310,11 @@ impl<'s> Parser<'s> {
&& matches!(first.kind, lex::Kind::Seq(Sequence::Backtick)) && matches!(first.kind, lex::Kind::Seq(Sequence::Backtick))
{ {
let raw_format = self.input.ahead_raw_format(); let raw_format = self.input.ahead_raw_format();
if let Some(span_format) = raw_format { if let Some(span_format) = raw_format.clone() {
self.events[event_opener].kind = EventKind::Enter(RawFormat { self.events[event_opener].kind = EventKind::Enter(RawFormat {
format: span_format.of(self.input.src), format: &self.input.src[span_format.clone()],
}); });
self.input.span = Span::new(self.input.span.start(), span_format.end() + 1); self.input.span.end = span_format.end + 1;
}; };
let ty_opener = if let EventKind::Enter(ty) = self.events[event_opener].kind { let ty_opener = if let EventKind::Enter(ty) = self.events[event_opener].kind {
debug_assert!(matches!( debug_assert!(matches!(
@ -345,12 +347,9 @@ impl<'s> Parser<'s> {
} }
} else { } else {
// continue verbatim // continue verbatim
let is_whitespace = self let is_whitespace = self.input.src.as_bytes()[self.input.span.clone()]
.input .iter()
.span .all(|b| b.is_ascii_whitespace());
.of(self.input.src)
.chars()
.all(char::is_whitespace);
if is_whitespace { if is_whitespace {
if !*non_whitespace_encountered if !*non_whitespace_encountered
&& self.input.peek().map_or(false, |t| { && self.input.peek().map_or(false, |t| {
@ -374,19 +373,19 @@ impl<'s> Parser<'s> {
let ty = if let Some(sp) = self let ty = if let Some(sp) = self
.events .events
.back() .back()
.and_then(|e| matches!(&e.kind, EventKind::Str).then(|| e.span)) .and_then(|e| matches!(&e.kind, EventKind::Str).then(|| e.span.clone()))
.filter(|sp| { .filter(|sp| {
sp.end() == self.input.span.start() sp.end == self.input.span.start
&& sp.of(self.input.src).as_bytes()[sp.len() - 1] == b'$' && self.input.src.as_bytes()[sp.start + sp.len() - 1] == b'$'
&& sp && sp
.end() .end
.checked_sub(2) .checked_sub(2)
.map_or(true, |i| self.input.src.as_bytes()[i] != b'\\') .map_or(true, |i| self.input.src.as_bytes()[i] != b'\\')
}) { }) {
let (ty, num_dollar) = if sp.len() > 1 let (ty, num_dollar) = if sp.len() > 1
&& sp.of(self.input.src).as_bytes()[sp.len() - 2] == b'$' && self.input.src.as_bytes()[sp.start + sp.len() - 2] == b'$'
&& sp && sp
.end() .end
.checked_sub(3) .checked_sub(3)
.map_or(true, |i| self.input.src.as_bytes()[i] != b'\\') .map_or(true, |i| self.input.src.as_bytes()[i] != b'\\')
{ {
@ -394,14 +393,17 @@ impl<'s> Parser<'s> {
} else { } else {
(InlineMath, 1) (InlineMath, 1)
}; };
let border = sp.end() - num_dollar; let border = sp.end - num_dollar;
self.events.back_mut().unwrap().span = Span::new(sp.start(), border); self.events.back_mut().unwrap().span = sp.start..border;
self.input.span = Span::new(border, self.input.span.end()); self.input.span = border..self.input.span.end;
ty ty
} else { } else {
Verbatim Verbatim
}; };
self.push_sp(EventKind::Placeholder, self.input.span.empty_before()); self.push_sp(
EventKind::Placeholder,
self.input.span.start..self.input.span.start,
);
self.verbatim = Some(VerbatimState { self.verbatim = Some(VerbatimState {
event_opener: self.events.len(), event_opener: self.events.len(),
len_opener, len_opener,
@ -435,7 +437,7 @@ impl<'s> Parser<'s> {
) -> Option<ControlFlow> { ) -> Option<ControlFlow> {
let state = AttributesState { let state = AttributesState {
elem_ty, elem_ty,
end_attr: self.input.span.end() - usize::from(opener_eaten), end_attr: self.input.span.end - usize::from(opener_eaten),
valid_lines: 0, valid_lines: 0,
validator: attr::Validator::new(), validator: attr::Validator::new(),
}; };
@ -448,17 +450,17 @@ impl<'s> Parser<'s> {
opener_eaten: bool, opener_eaten: bool,
first: bool, first: bool,
) -> Option<ControlFlow> { ) -> Option<ControlFlow> {
let start_attr = self.input.span.end() - usize::from(opener_eaten); let start_attr = self.input.span.end - usize::from(opener_eaten);
debug_assert!(self.input.src[start_attr..].starts_with('{')); debug_assert!(self.input.src[start_attr..].starts_with('{'));
let (mut line_next, mut line_start, mut line_end) = if first { let (mut line_next, mut line_start, mut line_end) = if first {
(0, start_attr, self.input.span_line.end()) (0, start_attr, self.input.span_line.end)
} else { } else {
let last = self.input.ahead.len() - 1; let last = self.input.ahead.len() - 1;
( (
self.input.ahead.len(), self.input.ahead.len(),
self.input.ahead[last].start(), self.input.ahead[last].start,
self.input.ahead[last].end(), self.input.ahead[last].end,
) )
}; };
{ {
@ -481,18 +483,18 @@ impl<'s> Parser<'s> {
} }
} else if let Some(l) = self.input.ahead.get(line_next) { } else if let Some(l) = self.input.ahead.get(line_next) {
line_next += 1; line_next += 1;
line_start = l.start(); line_start = l.start;
line_end = l.end(); line_end = l.end;
res = state.validator.parse(l.of(self.input.src)); res = state.validator.parse(&self.input.src[l.clone()]);
} else if self.input.complete { } else if self.input.complete {
// no need to ask for more input // no need to ask for more input
break; break;
} else { } else {
self.attributes = Some(state); self.attributes = Some(state);
if opener_eaten { if opener_eaten {
self.input.span = Span::empty_at(start_attr); self.input.span = start_attr..start_attr;
self.input.lexer = lex::Lexer::new( self.input.lexer = lex::Lexer::new(
&self.input.src[start_attr..self.input.span_line.end()], &self.input.src.as_bytes()[start_attr..self.input.span_line.end],
); );
} }
return Some(More); return Some(More);
@ -506,12 +508,12 @@ impl<'s> Parser<'s> {
// retrieve attributes // retrieve attributes
let attrs = { let attrs = {
let first = Span::new(start_attr, self.input.span_line.end()); let first = start_attr..self.input.span_line.end;
let mut parser = attr::Parser::new(attr::Attributes::new()); let mut parser = attr::Parser::new(attr::Attributes::new());
for line in std::iter::once(first) for line in std::iter::once(first)
.chain(self.input.ahead.iter().take(state.valid_lines).copied()) .chain(self.input.ahead.iter().take(state.valid_lines).cloned())
{ {
let line = line.start()..usize::min(state.end_attr, line.end()); let line = line.start..usize::min(state.end_attr, line.end);
parser.parse(&self.input.src[line]); parser.parse(&self.input.src[line]);
} }
parser.finish() parser.finish()
@ -521,14 +523,13 @@ impl<'s> Parser<'s> {
let l = self.input.ahead.pop_front().unwrap(); let l = self.input.ahead.pop_front().unwrap();
self.input.set_current_line(l); self.input.set_current_line(l);
} }
self.input.span = Span::new(start_attr, state.end_attr); self.input.span = start_attr..state.end_attr;
self.input.lexer = lex::Lexer::new(&self.input.src[state.end_attr..line_end]); self.input.lexer = lex::Lexer::new(&self.input.src.as_bytes()[state.end_attr..line_end]);
if attrs.is_empty() { if attrs.is_empty() {
if matches!(state.elem_ty, AttributesElementType::Container { .. }) { if matches!(state.elem_ty, AttributesElementType::Container { .. }) {
let last = self.events.len() - 1; let last = self.events.len() - 1;
self.events[last].span = self.events[last].span.end = self.input.span.end;
Span::new(self.events[last].span.start(), self.input.span.end());
} }
} else { } else {
let attr_index = self.store_attributes.len() as AttributesIndex; let attr_index = self.store_attributes.len() as AttributesIndex;
@ -538,7 +539,7 @@ impl<'s> Parser<'s> {
container: matches!(state.elem_ty, AttributesElementType::Container { .. }), container: matches!(state.elem_ty, AttributesElementType::Container { .. }),
attrs: attr_index, attrs: attr_index,
}, },
span: self.input.span, span: self.input.span.clone(),
}; };
match state.elem_ty { match state.elem_ty {
AttributesElementType::Container { e_placeholder } => { AttributesElementType::Container { e_placeholder } => {
@ -548,8 +549,7 @@ impl<'s> Parser<'s> {
self.events[e_placeholder + 1].kind = EventKind::Enter(Span); self.events[e_placeholder + 1].kind = EventKind::Enter(Span);
self.events[last].kind = EventKind::Exit(Span); self.events[last].kind = EventKind::Exit(Span);
} }
self.events[last].span = self.events[last].span.end = self.input.span.end;
Span::new(self.events[last].span.start(), self.input.span.end());
} }
AttributesElementType::Word => { AttributesElementType::Word => {
self.events.push_back(attr_event); self.events.push_back(attr_event);
@ -562,32 +562,34 @@ impl<'s> Parser<'s> {
fn parse_autolink(&mut self, first: &lex::Token) -> Option<ControlFlow> { fn parse_autolink(&mut self, first: &lex::Token) -> Option<ControlFlow> {
if first.kind == lex::Kind::Sym(Symbol::Lt) { if first.kind == lex::Kind::Sym(Symbol::Lt) {
let mut ahead = self.input.lexer.ahead().chars();
let mut end = false; let mut end = false;
let mut is_url = false; let mut is_url = false;
let len = (&mut ahead) let len = self
.input
.lexer
.ahead()
.iter()
.take_while(|c| { .take_while(|c| {
if *c == '<' { if **c == b'<' {
return false; return false;
} }
if *c == '>' { if **c == b'>' {
end = true; end = true;
}; };
if matches!(*c, ':' | '@') { if matches!(*c, b':' | b'@') {
is_url = true; is_url = true;
} }
!end && !c.is_whitespace() !end && !c.is_ascii_whitespace()
}) })
.map(char::len_utf8) .count();
.sum();
if end && is_url { if end && is_url {
self.input.lexer = lex::Lexer::new(ahead.as_str()); self.input.lexer.skip_ahead(len + 1);
let span_url = self.input.span.after(len); let span_url = self.input.span.end..(self.input.span.end + len);
let url = span_url.of(self.input.src); let url = &self.input.src[span_url.clone()];
self.push(EventKind::Enter(Autolink(url))); self.push(EventKind::Enter(Autolink(url)));
self.input.span = span_url; self.input.span = span_url;
self.push(EventKind::Str); self.push(EventKind::Str);
self.input.span = self.input.span.after(1); self.input.span = self.input.span.end..(self.input.span.end + 1);
return self.push(EventKind::Exit(Autolink(url))); return self.push(EventKind::Exit(Autolink(url)));
} }
} }
@ -596,27 +598,27 @@ impl<'s> Parser<'s> {
fn parse_symbol(&mut self, first: &lex::Token) -> Option<ControlFlow> { fn parse_symbol(&mut self, first: &lex::Token) -> Option<ControlFlow> {
if first.kind == lex::Kind::Sym(Symbol::Colon) { if first.kind == lex::Kind::Sym(Symbol::Colon) {
let mut ahead = self.input.lexer.ahead().chars();
let mut end = false; let mut end = false;
let mut valid = true; let mut valid = true;
let len = (&mut ahead) let len = self
.input
.lexer
.ahead()
.iter()
.take_while(|c| { .take_while(|c| {
if *c == ':' { if **c == b':' {
end = true; end = true;
} else if !c.is_ascii_alphanumeric() && !matches!(c, '-' | '+' | '_') { } else if !c.is_ascii_alphanumeric() && !matches!(c, b'-' | b'+' | b'_') {
valid = false; valid = false;
} }
!end && !c.is_whitespace() !end && !c.is_ascii_whitespace()
}) })
.map(char::len_utf8) .count();
.sum();
if end && valid { if end && valid {
self.input.lexer = lex::Lexer::new(ahead.as_str()); self.input.lexer.skip_ahead(len + 1);
let span_symbol = self.input.span.after(len); let span_symbol = self.input.span.end..(self.input.span.end + len);
self.input.span = Span::new(self.input.span.start(), span_symbol.end() + 1); self.input.span.end = span_symbol.end + 1;
return self.push(EventKind::Atom(Atom::Symbol( return self.push(EventKind::Atom(Atom::Symbol(&self.input.src[span_symbol])));
span_symbol.of(self.input.src),
)));
} }
} }
None None
@ -640,25 +642,27 @@ impl<'s> Parser<'s> {
len: 1, len: 1,
}) })
); );
let mut ahead = self.input.lexer.ahead().chars();
let mut end = false; let mut end = false;
let len = (&mut ahead) let len = self
.input
.lexer
.ahead()
.iter()
.take_while(|c| { .take_while(|c| {
if *c == '[' { if **c == b'[' {
return false; return false;
} }
if *c == ']' { if **c == b']' {
end = true; end = true;
}; };
!end && *c != '\n' !end && **c != b'\n'
}) })
.map(char::len_utf8) .count();
.sum();
if end { if end {
self.input.lexer = lex::Lexer::new(ahead.as_str()); self.input.lexer.skip_ahead(len + 1);
let span_label = self.input.span.after(len); let span_label = self.input.span.end..(self.input.span.end + len);
let label = span_label.of(self.input.src); let label = &self.input.src[span_label.clone()];
self.input.span = Span::new(self.input.span.start(), span_label.end() + 1); self.input.span.end = span_label.end + 1;
return self.push(EventKind::Atom(FootnoteReference { label })); return self.push(EventKind::Atom(FootnoteReference { label }));
} }
} }
@ -683,13 +687,11 @@ impl<'s> Parser<'s> {
// empty container // empty container
return None; return None;
} }
let whitespace_before = self.events.back().map_or(false, |ev| { let whitespace_before = if 0 < self.input.span.start {
ev.span self.input.src.as_bytes()[self.input.span.start - 1].is_ascii_whitespace()
.of(self.input.src) } else {
.chars() false
.last() };
.map_or(false, char::is_whitespace)
});
if opener.bidirectional() && whitespace_before { if opener.bidirectional() && whitespace_before {
return None; return None;
} }
@ -729,14 +731,13 @@ impl<'s> Parser<'s> {
inline, inline,
image, image,
} => { } => {
let span_spec = self.events[e_opener].span.between(self.input.span); let span_spec = self.events[e_opener].span.end..self.input.span.start;
let multiline = let multiline =
self.events[e_opener].span.start() < self.input.span_line.start(); self.events[e_opener].span.start < self.input.span_line.start;
let spec: CowStr = if span_spec.is_empty() && !inline { let spec: CowStr = if span_spec.is_empty() && !inline {
let span_spec = self.events[event_span] let span_spec = self.events[event_span].span.end
.span ..self.events[e_opener - 1].span.start;
.between(self.events[e_opener - 1].span);
let events_text = self let events_text = self
.events .events
.iter() .iter()
@ -748,23 +749,31 @@ impl<'s> Parser<'s> {
!matches!(ev.kind, EventKind::Str | EventKind::Atom(..)) !matches!(ev.kind, EventKind::Str | EventKind::Atom(..))
}) })
{ {
events_text let mut spec = String::new();
.filter(|ev| { let mut span = 0..0;
for ev in events_text.filter(|ev| {
matches!(ev.kind, EventKind::Str | EventKind::Atom(..)) matches!(ev.kind, EventKind::Str | EventKind::Atom(..))
}) }) {
.map(|ev| ev.span.of(self.input.src)) if span.end == ev.span.start {
.collect::<String>() span.end = ev.span.end;
.into()
} else { } else {
span_spec.of(self.input.src).into() spec.push_str(&self.input.src[span.clone()]);
span = ev.span.clone();
}
}
spec.push_str(&self.input.src[span]);
spec.into()
} else {
self.input.src[span_spec].into()
} }
} else if multiline { } else if multiline {
let mut spec = String::new(); let mut spec = String::new();
let mut first_part = true; let mut first_part = true;
let mut span = self.events[e_opener].span.empty_after(); let mut span =
self.events[e_opener].span.end..self.events[e_opener].span.end;
let mut append = |span: Span| { let mut append = |span: Range<usize>| {
span.of(self.input.src).split('\n').for_each(|s| { self.input.src[span].split('\n').for_each(|s| {
if !s.is_empty() { if !s.is_empty() {
if !inline && !first_part { if !inline && !first_part {
spec.push(' '); spec.push(' ');
@ -776,18 +785,18 @@ impl<'s> Parser<'s> {
}; };
for ev in self.events.iter().skip(e_opener + 1) { for ev in self.events.iter().skip(e_opener + 1) {
if span.end() == ev.span.start() { if span.end == ev.span.start {
span = Span::new(span.start(), ev.span.end()); span.end = ev.span.end;
} else { } else {
append(span); append(span);
span = ev.span; span = ev.span.clone();
} }
} }
append(span); append(span);
spec.into() spec.into()
} else { } else {
span_spec.of(self.input.src).into() self.input.src[span_spec.clone()].into()
}; };
let idx = self.store_cowstrs.len() as CowStrIndex; let idx = self.store_cowstrs.len() as CowStrIndex;
@ -801,10 +810,7 @@ impl<'s> Parser<'s> {
self.events[event_span].kind = EventKind::Enter(container); self.events[event_span].kind = EventKind::Enter(container);
self.events[e_opener - 1] = Event { self.events[e_opener - 1] = Event {
kind: EventKind::Exit(container), kind: EventKind::Exit(container),
span: Span::new( span: (self.events[e_opener - 1].span.start)..(span_spec.end + 1),
self.events[e_opener - 1].span.start(),
span_spec.end() + 1,
),
}; };
self.events.drain(e_opener..); self.events.drain(e_opener..);
Some(Continue) Some(Continue)
@ -831,19 +837,17 @@ impl<'s> Parser<'s> {
.input .input
.lexer .lexer
.ahead() .ahead()
.chars() .iter()
.next() .next()
.map_or(true, char::is_whitespace); .map_or(true, |c| c.is_ascii_whitespace());
if opener.bidirectional() && whitespace_after { if opener.bidirectional() && whitespace_after {
return None; return None;
} }
let whitespace_before = self.events.back().map_or(false, |ev| { let whitespace_before = if 0 < self.input.span.start {
ev.span self.input.src.as_bytes()[self.input.span.start - 1].is_ascii_whitespace()
.of(self.input.src) } else {
.chars() false
.last() };
.map_or(false, char::is_whitespace)
});
if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted) if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
&& self && self
.events .events
@ -857,7 +861,7 @@ impl<'s> Parser<'s> {
// push dummy event in case attributes are encountered after closing delimiter // push dummy event in case attributes are encountered after closing delimiter
self.push_sp( self.push_sp(
EventKind::Placeholder, EventKind::Placeholder,
Span::empty_at(self.input.span.start()), self.input.span.start..self.input.span.start,
); );
// use non-opener for now, replace if closed later // use non-opener for now, replace if closed later
self.push(match opener { self.push(match opener {
@ -882,8 +886,9 @@ impl<'s> Parser<'s> {
lex::Kind::Nbsp => Nbsp, lex::Kind::Nbsp => Nbsp,
lex::Kind::Seq(Sequence::Period) if first.len >= 3 => { lex::Kind::Seq(Sequence::Period) if first.len >= 3 => {
while self.input.span.len() > 3 { while self.input.span.len() > 3 {
self.push_sp(EventKind::Atom(Ellipsis), self.input.span.with_len(3)); let end = self.input.span.start + 3;
self.input.span = self.input.span.skip(3); self.push_sp(EventKind::Atom(Ellipsis), self.input.span.start..end);
self.input.span.start = end;
} }
if self.input.span.len() == 3 { if self.input.span.len() == 3 {
Ellipsis Ellipsis
@ -904,9 +909,10 @@ impl<'s> Parser<'s> {
.take(m) .take(m)
.chain(std::iter::repeat(EnDash).take(n)) .chain(std::iter::repeat(EnDash).take(n))
.for_each(|atom| { .for_each(|atom| {
let l = if matches!(atom, EnDash) { 2 } else { 3 }; let end =
self.push_sp(EventKind::Atom(atom), self.input.span.with_len(l)); self.input.span.start + if matches!(atom, EnDash) { 2 } else { 3 };
self.input.span = self.input.span.skip(l); self.push_sp(EventKind::Atom(atom), self.input.span.start..end);
self.input.span.start = end;
}); });
return Some(Continue); return Some(Continue);
} }
@ -932,15 +938,18 @@ impl<'s> Parser<'s> {
self.push(EventKind::Atom(atom)) self.push(EventKind::Atom(atom))
} }
fn merge_str_events(&mut self, span_str: Span) -> Event<'s> { fn merge_str_events(&mut self, span_str: Range<usize>) -> Event<'s> {
let mut span = span_str; let mut span = span_str;
let should_merge = |e: &Event, span: Span| { let should_merge = |e: &Event, span: Range<usize>| {
matches!(e.kind, EventKind::Str | EventKind::Placeholder) matches!(e.kind, EventKind::Str | EventKind::Placeholder) && span.end == e.span.start
&& span.end() == e.span.start()
}; };
while self.events.front().map_or(false, |e| should_merge(e, span)) { while self
.events
.front()
.map_or(false, |e| should_merge(e, span.clone()))
{
let ev = self.events.pop_front().unwrap(); let ev = self.events.pop_front().unwrap();
span = span.union(ev.span); span.end = ev.span.end;
} }
if matches!( if matches!(
@ -959,14 +968,14 @@ impl<'s> Parser<'s> {
} }
} }
fn apply_word_attributes(&mut self, span_str: Span) -> Event<'s> { fn apply_word_attributes(&mut self, span_str: Range<usize>) -> Event<'s> {
if let Some(i) = span_str if let Some(i) = self.input.src[span_str.clone()]
.of(self.input.src)
.bytes() .bytes()
.rposition(|c| c.is_ascii_whitespace()) .rposition(|c| c.is_ascii_whitespace())
{ {
let before = span_str.with_len(i + 1); let word_start = span_str.start + i + 1;
let word = span_str.skip(i + 1); let before = span_str.start..word_start;
let word = word_start..span_str.end;
self.events.push_front(Event { self.events.push_front(Event {
kind: EventKind::Str, kind: EventKind::Str,
span: word, span: word,
@ -979,15 +988,15 @@ impl<'s> Parser<'s> {
let attr = self.events.pop_front().unwrap(); let attr = self.events.pop_front().unwrap();
self.events.push_front(Event { self.events.push_front(Event {
kind: EventKind::Exit(Span), kind: EventKind::Exit(Span),
span: attr.span, span: attr.span.clone(),
}); });
self.events.push_front(Event { self.events.push_front(Event {
kind: EventKind::Str, kind: EventKind::Str,
span: span_str, span: span_str.clone(),
}); });
self.events.push_front(Event { self.events.push_front(Event {
kind: EventKind::Enter(Span), kind: EventKind::Enter(Span),
span: span_str.empty_before(), span: span_str.start..span_str.start,
}); });
attr attr
} }
@ -1198,8 +1207,8 @@ mod test {
($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
#[allow(unused)] #[allow(unused)]
let mut p = super::Parser::new($src); let mut p = super::Parser::new($src);
p.feed_line(super::Span::by_len(0, $src.len()), true); p.feed_line(0..$src.len(), true);
let actual = p.map(|ev| (ev.kind, ev.span.of($src))).collect::<Vec<_>>(); let actual = p.map(|ev| (ev.kind, &$src[ev.span])).collect::<Vec<_>>();
let expected = &[$($($token),*,)?]; let expected = &[$($($token),*,)?];
assert_eq!(actual, expected, "\n\n{}\n\n", $src); assert_eq!(actual, expected, "\n\n{}\n\n", $src);
}; };

View file

@ -60,35 +60,33 @@ pub enum Sequence {
} }
impl Sequence { impl Sequence {
fn ch(self) -> char { fn ch(self) -> u8 {
match self { match self {
Self::Backtick => '`', Self::Backtick => b'`',
Self::Period => '.', Self::Period => b'.',
Self::Hyphen => '-', Self::Hyphen => b'-',
} }
} }
} }
#[derive(Clone)] #[derive(Clone)]
pub(crate) struct Lexer<'s> { pub(crate) struct Lexer<'s> {
src: &'s str, src: &'s [u8],
chars: std::str::Chars<'s>, /// Current position within `src`.
pos: usize,
/// Next character should be escaped. /// Next character should be escaped.
escape: bool, escape: bool,
/// Token to be peeked or next'ed. /// Token to be peeked or next'ed.
next: Option<Token>, next: Option<Token>,
/// Length of current token.
len: usize,
} }
impl<'s> Lexer<'s> { impl<'s> Lexer<'s> {
pub fn new(src: &'s str) -> Self { pub fn new(src: &'s [u8]) -> Self {
Lexer { Lexer {
src, src,
chars: src.chars(), pos: 0,
escape: false, escape: false,
next: None, next: None,
len: 0,
} }
} }
@ -101,10 +99,12 @@ impl<'s> Lexer<'s> {
self.next.as_ref() self.next.as_ref()
} }
pub fn ahead(&self) -> &'s str { pub fn ahead(&self) -> &'s [u8] {
let pos = &self.src[self.pos - self.next.as_ref().map_or(0, |t| t.len)..]
self.src.len() - self.chars.as_str().len() - self.next.as_ref().map_or(0, |t| t.len); }
&self.src[pos..]
pub fn skip_ahead(&mut self, n: usize) {
*self = Self::new(&self.src[self.pos + n..]);
} }
fn next_token(&mut self) -> Option<Token> { fn next_token(&mut self) -> Option<Token> {
@ -122,24 +122,28 @@ impl<'s> Lexer<'s> {
current current
} }
fn peek_char_n(&mut self, n: usize) -> Option<char> { fn peek_byte_n(&mut self, n: usize) -> Option<u8> {
self.chars.clone().nth(n) self.src.get(self.pos + n).copied()
} }
fn peek_char(&mut self) -> Option<char> { fn peek_byte(&mut self) -> Option<u8> {
self.peek_char_n(0) self.peek_byte_n(0)
} }
fn eat_char(&mut self) -> Option<char> { fn eat_byte(&mut self) -> Option<u8> {
let c = self.chars.next(); if self.pos < self.src.len() {
self.len += c.map_or(0, char::len_utf8); let c = self.src[self.pos];
c self.pos += 1;
Some(c)
} else {
None
}
} }
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
while let Some(c) = self.peek_char() { while let Some(c) = self.peek_byte() {
if predicate(c) { if predicate(c) {
self.eat_char(); self.eat_byte();
} else { } else {
break; break;
} }
@ -147,34 +151,36 @@ impl<'s> Lexer<'s> {
} }
fn token(&mut self) -> Option<Token> { fn token(&mut self) -> Option<Token> {
self.len = 0; let start = self.pos;
let kind = if self.escape { let kind = if self.escape {
self.escape = false; self.escape = false;
match self.eat_char()? { match self.eat_byte()? {
'\n' => Hardbreak, b'\n' => Hardbreak,
'\t' | ' ' b'\t' | b' '
if self.chars.clone().find(|c| !matches!(c, ' ' | '\t')) == Some('\n') => if self.src[self.pos..]
.iter()
.find(|c| !matches!(c, b' ' | b'\t'))
== Some(&b'\n') =>
{ {
while self.eat_char() != Some('\n') {} while self.eat_byte() != Some(b'\n') {}
Hardbreak Hardbreak
} }
' ' => Nbsp, b' ' => Nbsp,
_ => Text, _ => Text,
} }
} else { } else {
self.eat_while(|c| !is_special(c)); self.eat_while(|c| !is_special(c));
if self.len > 0 { if start < self.pos {
Text Text
} else { } else {
match self.eat_char()? { match self.eat_byte()? {
'\n' => Newline, b'\n' => Newline,
'\\' => { b'\\' => {
if self if self.peek_byte().map_or(false, |c| {
.peek_char() c.is_ascii_whitespace() || c.is_ascii_punctuation()
.map_or(false, |c| c.is_whitespace() || c.is_ascii_punctuation()) }) {
{
self.escape = true; self.escape = true;
Escape Escape
} else { } else {
@ -182,62 +188,67 @@ impl<'s> Lexer<'s> {
} }
} }
'[' => Open(Bracket), b'[' => Open(Bracket),
']' => Close(Bracket), b']' => Close(Bracket),
'(' => Open(Paren), b'(' => Open(Paren),
')' => Close(Paren), b')' => Close(Paren),
'{' => { b'{' => {
let explicit = match self.peek_char() { let explicit = match self.peek_byte() {
Some('*') => Some(Open(BraceAsterisk)), Some(b'*') => Some(Open(BraceAsterisk)),
Some('^') => Some(Open(BraceCaret)), Some(b'^') => Some(Open(BraceCaret)),
Some('=') => Some(Open(BraceEqual)), Some(b'=') => Some(Open(BraceEqual)),
Some('-') => Some(Open(BraceHyphen)), Some(b'-') => Some(Open(BraceHyphen)),
Some('+') => Some(Open(BracePlus)), Some(b'+') => Some(Open(BracePlus)),
Some('~') => Some(Open(BraceTilde)), Some(b'~') => Some(Open(BraceTilde)),
Some('_') => Some(Open(BraceUnderscore)), Some(b'_') => Some(Open(BraceUnderscore)),
Some('\'') => Some(Open(BraceQuote1)), Some(b'\'') => Some(Open(BraceQuote1)),
Some('"') => Some(Open(BraceQuote2)), Some(b'"') => Some(Open(BraceQuote2)),
_ => None, _ => None,
}; };
if let Some(exp) = explicit { if let Some(exp) = explicit {
self.eat_char(); self.eat_byte();
exp exp
} else { } else {
Open(Brace) Open(Brace)
} }
} }
'}' => Close(Brace), b'}' => Close(Brace),
'*' => self.maybe_eat_close_brace(Sym(Asterisk), BraceAsterisk), b'*' => self.maybe_eat_close_brace(Sym(Asterisk), BraceAsterisk),
'^' => self.maybe_eat_close_brace(Sym(Caret), BraceCaret), b'^' => self.maybe_eat_close_brace(Sym(Caret), BraceCaret),
'=' => self.maybe_eat_close_brace(Text, BraceEqual), b'=' => self.maybe_eat_close_brace(Text, BraceEqual),
'+' => self.maybe_eat_close_brace(Text, BracePlus), b'+' => self.maybe_eat_close_brace(Text, BracePlus),
'~' => self.maybe_eat_close_brace(Sym(Tilde), BraceTilde), b'~' => self.maybe_eat_close_brace(Sym(Tilde), BraceTilde),
'_' => self.maybe_eat_close_brace(Sym(Underscore), BraceUnderscore), b'_' => self.maybe_eat_close_brace(Sym(Underscore), BraceUnderscore),
'\'' => self.maybe_eat_close_brace(Sym(Quote1), BraceQuote1), b'\'' => self.maybe_eat_close_brace(Sym(Quote1), BraceQuote1),
'"' => self.maybe_eat_close_brace(Sym(Quote2), BraceQuote2), b'"' => self.maybe_eat_close_brace(Sym(Quote2), BraceQuote2),
'-' => { b'-' => {
if self.peek_char() == Some('}') { if self.peek_byte() == Some(b'}') {
self.eat_char(); self.eat_byte();
Close(BraceHyphen) Close(BraceHyphen)
} else { } else {
while self.peek_char() == Some('-') && self.peek_char_n(1) != Some('}') while self.peek_byte() == Some(b'-')
&& self.peek_byte_n(1) != Some(b'}')
{ {
self.eat_char(); self.eat_byte();
} }
Seq(Hyphen) Seq(Hyphen)
} }
} }
'!' if self.peek_char() == Some('[') => { b'!' => {
self.eat_char(); if self.peek_byte() == Some(b'[') {
self.eat_byte();
Sym(ExclaimBracket) Sym(ExclaimBracket)
} else {
Text
} }
'<' => Sym(Lt), }
'|' => Sym(Pipe), b'<' => Sym(Lt),
':' => Sym(Colon), b'|' => Sym(Pipe),
b':' => Sym(Colon),
'`' => self.eat_seq(Backtick), b'`' => self.eat_seq(Backtick),
'.' => self.eat_seq(Period), b'.' => self.eat_seq(Period),
_ => Text, _ => Text,
} }
@ -246,7 +257,7 @@ impl<'s> Lexer<'s> {
Some(Token { Some(Token {
kind, kind,
len: self.len, len: self.pos - start,
}) })
} }
@ -256,8 +267,8 @@ impl<'s> Lexer<'s> {
} }
fn maybe_eat_close_brace(&mut self, kind: Kind, d: Delimiter) -> Kind { fn maybe_eat_close_brace(&mut self, kind: Kind, d: Delimiter) -> Kind {
if self.peek_char() == Some('}') { if self.peek_byte() == Some(b'}') {
self.eat_char(); self.eat_byte();
Close(d) Close(d)
} else { } else {
kind kind
@ -273,31 +284,32 @@ impl<'s> Iterator for Lexer<'s> {
} }
} }
fn is_special(c: char) -> bool { fn is_special(c: u8) -> bool {
matches!( matches!(
c, c,
'\\' | '[' b'\\'
| ']' | b'['
| '(' | b']'
| ')' | b'('
| '{' | b')'
| '}' | b'{'
| '*' | b'}'
| '^' | b'*'
| '=' | b'^'
| '+' | b'='
| '~' | b'+'
| '_' | b'~'
| '\'' | b'_'
| '"' | b'\''
| '-' | b'"'
| '!' | b'-'
| '<' | b'!'
| '|' | b'<'
| ':' | b'|'
| '`' | b':'
| '.' | b'`'
| '\n' | b'.'
| b'\n'
) )
} }
@ -311,7 +323,7 @@ mod test {
macro_rules! test_lex { macro_rules! test_lex {
($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
#[allow(unused)] #[allow(unused)]
let actual = super::Lexer::new($src).collect::<Vec<_>>(); let actual = super::Lexer::new($src.as_bytes()).collect::<Vec<_>>();
let expected = vec![$($($token),*,)?]; let expected = vec![$($($token),*,)?];
assert_eq!(actual, expected, "{}", $src); assert_eq!(actual, expected, "{}", $src);
}; };

View file

@ -60,9 +60,6 @@ mod attr;
mod block; mod block;
mod inline; mod inline;
mod lex; mod lex;
mod span;
use span::Span;
pub use attr::{AttributeValue, AttributeValueParts, Attributes}; pub use attr::{AttributeValue, AttributeValueParts, Attributes};
@ -610,7 +607,7 @@ impl<'s> PrePass<'s> {
let mut blocks = blocks.peekable(); let mut blocks = blocks.peekable();
let mut attr_prev: Option<Span> = None; let mut attr_prev: Option<Range<usize>> = None;
while let Some(e) = blocks.next() { while let Some(e) = blocks.next() {
match e.kind { match e.kind {
block::EventKind::Enter(block::Node::Leaf(block::Leaf::LinkDefinition { block::EventKind::Enter(block::Node::Leaf(block::Leaf::LinkDefinition {
@ -624,18 +621,23 @@ impl<'s> PrePass<'s> {
// All link definition tags have to be obtained initially, as references can // All link definition tags have to be obtained initially, as references can
// appear before the definition. // appear before the definition.
let attrs = let attrs = attr_prev
attr_prev.map_or_else(Attributes::new, |sp| attr::parse(sp.of(src))); .as_ref()
.map_or_else(Attributes::new, |sp| attr::parse(&src[sp.clone()]));
let url = if !next_is_inline(&mut blocks) { let url = if !next_is_inline(&mut blocks) {
"".into() "".into()
} else { } else {
let start = blocks.next().unwrap().span.of(src).trim(); let start = src[blocks.next().as_ref().unwrap().span.clone()]
.trim_matches(|c: char| c.is_ascii_whitespace());
if !next_is_inline(&mut blocks) { if !next_is_inline(&mut blocks) {
start.into() start.into()
} else { } else {
let mut url = start.to_string(); let mut url = start.to_string();
while next_is_inline(&mut blocks) { while next_is_inline(&mut blocks) {
url.push_str(blocks.next().unwrap().span.of(src).trim()); url.push_str(
src[blocks.next().as_ref().unwrap().span.clone()]
.trim_matches(|c: char| c.is_ascii_whitespace()),
);
} }
url.into() url.into()
} }
@ -648,7 +650,7 @@ impl<'s> PrePass<'s> {
// as formatting must be removed. // as formatting must be removed.
// //
// We choose to parse all headers twice instead of caching them. // We choose to parse all headers twice instead of caching them.
let attrs = attr_prev.map(|sp| attr::parse(sp.of(src))); let attrs = attr_prev.as_ref().map(|sp| attr::parse(&src[sp.clone()]));
let id_override = attrs let id_override = attrs
.as_ref() .as_ref()
.and_then(|attrs| attrs.get("id")) .and_then(|attrs| attrs.get("id"))
@ -662,23 +664,26 @@ impl<'s> PrePass<'s> {
loop { loop {
let span_inline = blocks.next().and_then(|e| { let span_inline = blocks.next().and_then(|e| {
if matches!(e.kind, block::EventKind::Inline) { if matches!(e.kind, block::EventKind::Inline) {
last_end = e.span.end(); last_end = e.span.end;
Some(e.span) Some(e.span.clone())
} else { } else {
None None
} }
}); });
inline_parser.feed_line( inline_parser.feed_line(
span_inline.unwrap_or_else(|| Span::empty_at(last_end)), span_inline.as_ref().cloned().unwrap_or(last_end..last_end),
span_inline.is_none(), span_inline.is_none(),
); );
inline_parser.for_each(|ev| match ev.kind { inline_parser.for_each(|ev| match ev.kind {
inline::EventKind::Str => { inline::EventKind::Str => {
text.push_str(ev.span.of(src)); text.push_str(&src[ev.span.clone()]);
let mut chars = ev.span.of(src).chars().peekable(); let mut chars = src[ev.span].chars().peekable();
while let Some(c) = chars.next() { while let Some(c) = chars.next() {
if c.is_whitespace() { if c.is_ascii_whitespace() {
while chars.peek().map_or(false, |c| c.is_whitespace()) { while chars
.peek()
.map_or(false, |c| c.is_ascii_whitespace())
{
chars.next(); chars.next();
} }
if !last_whitespace { if !last_whitespace {
@ -726,14 +731,14 @@ impl<'s> PrePass<'s> {
std::mem::transmute::<&str, &'static str>(id_auto.as_ref()) std::mem::transmute::<&str, &'static str>(id_auto.as_ref())
}); });
headings.push(Heading { headings.push(Heading {
location: e.span.start() as u32, location: e.span.start as u32,
id_auto, id_auto,
text, text,
id_override, id_override,
}); });
} }
block::EventKind::Atom(block::Atom::Attributes) => { block::EventKind::Atom(block::Atom::Attributes) => {
attr_prev = Some(e.span); attr_prev = Some(e.span.clone());
} }
block::EventKind::Enter(..) block::EventKind::Enter(..)
| block::EventKind::Exit(block::Node::Container(block::Container::Section { | block::EventKind::Exit(block::Node::Container(block::Container::Section {
@ -1000,31 +1005,31 @@ impl<'s> Parser<'s> {
inline::Atom::Hardbreak => Event::Hardbreak, inline::Atom::Hardbreak => Event::Hardbreak,
inline::Atom::Escape => Event::Escape, inline::Atom::Escape => Event::Escape,
}, },
inline::EventKind::Str => Event::Str(inline.span.of(self.src).into()), inline::EventKind::Str => Event::Str(self.src[inline.span.clone()].into()),
inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => { inline::EventKind::Attributes { .. } | inline::EventKind::Placeholder => {
panic!("{:?}", inline) panic!("{:?}", inline)
} }
}; };
(event, inline.span.into()) (event, inline.span)
}) })
} }
fn block(&mut self) -> Option<(Event<'s>, Range<usize>)> { fn block(&mut self) -> Option<(Event<'s>, Range<usize>)> {
while let Some(mut ev) = &mut self.blocks.next() { while let Some(mut ev) = self.blocks.next() {
let event = match ev.kind { let event = match ev.kind {
block::EventKind::Atom(a) => match a { block::EventKind::Atom(a) => match a {
block::Atom::Blankline => Event::Blankline, block::Atom::Blankline => Event::Blankline,
block::Atom::ThematicBreak => { block::Atom::ThematicBreak => {
if let Some(pos) = self.block_attributes_pos.take() { if let Some(pos) = self.block_attributes_pos.take() {
ev.span = Span::new(pos, ev.span.end()); ev.span.start = pos;
} }
Event::ThematicBreak(self.block_attributes.take()) Event::ThematicBreak(self.block_attributes.take())
} }
block::Atom::Attributes => { block::Atom::Attributes => {
if self.block_attributes_pos.is_none() { if self.block_attributes_pos.is_none() {
self.block_attributes_pos = Some(ev.span.start()); self.block_attributes_pos = Some(ev.span.start);
} }
self.block_attributes.parse(ev.span.of(self.src)); self.block_attributes.parse(&self.src[ev.span.clone()]);
continue; continue;
} }
}, },
@ -1123,7 +1128,7 @@ impl<'s> Parser<'s> {
}; };
if enter { if enter {
if let Some(pos) = self.block_attributes_pos.take() { if let Some(pos) = self.block_attributes_pos.take() {
ev.span = Span::new(pos, ev.span.end()); ev.span.start = pos;
} }
Event::Start(cont, self.block_attributes.take()) Event::Start(cont, self.block_attributes.take())
} else { } else {
@ -1134,10 +1139,10 @@ impl<'s> Parser<'s> {
} }
block::EventKind::Inline => { block::EventKind::Inline => {
if self.verbatim { if self.verbatim {
Event::Str(ev.span.of(self.src).into()) Event::Str(self.src[ev.span.clone()].into())
} else { } else {
self.inline_parser.feed_line( self.inline_parser.feed_line(
ev.span, ev.span.clone(),
!matches!( !matches!(
self.blocks.peek().map(|e| &e.kind), self.blocks.peek().map(|e| &e.kind),
Some(block::EventKind::Inline), Some(block::EventKind::Inline),
@ -1148,7 +1153,7 @@ impl<'s> Parser<'s> {
} }
block::EventKind::Stale => continue, block::EventKind::Stale => continue,
}; };
return Some((event, ev.span.into())); return Some((event, ev.span));
} }
None None
} }
@ -1460,6 +1465,7 @@ mod test {
#[test] #[test]
fn para() { fn para() {
/*
test_parse!( test_parse!(
"para", "para",
Start(Paragraph, Attributes::new()), Start(Paragraph, Attributes::new()),
@ -1472,6 +1478,7 @@ mod test {
Str("pa ra".into()), Str("pa ra".into()),
End(Paragraph), End(Paragraph),
); );
*/
test_parse!( test_parse!(
"para0\n\npara1", "para0\n\npara1",
Start(Paragraph, Attributes::new()), Start(Paragraph, Attributes::new()),

View file

@ -1,140 +0,0 @@
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq)]
pub struct Span {
start: u32,
end: u32,
}
impl From<Span> for std::ops::Range<usize> {
fn from(span: Span) -> Self {
span.start()..span.end()
}
}
impl Span {
pub fn new(start: usize, end: usize) -> Self {
Self::by_len(start, end.checked_sub(start).unwrap())
}
pub fn by_len(start: usize, len: usize) -> Self {
Self {
start: start.try_into().unwrap(),
end: start.checked_add(len).unwrap().try_into().unwrap(),
}
}
pub fn empty_at(start: usize) -> Self {
Self::by_len(start, 0)
}
pub fn empty_before(self) -> Self {
Self::empty_at(self.start())
}
pub fn empty_after(self) -> Self {
Self::empty_at(self.end())
}
pub fn with_len(self, len: usize) -> Self {
Self::by_len(self.start(), len)
}
pub fn after(self, len: usize) -> Self {
Self::by_len(self.end(), len)
}
pub fn union(self, span: Self) -> Self {
Self::new(self.start(), span.end())
}
pub fn between(self, span: Self) -> Self {
Self::new(self.end(), span.start())
}
pub fn skip(self, n: usize) -> Self {
Self::new(self.start() + n, self.end())
}
pub fn extend(self, n: usize) -> Self {
Self::new(self.start(), self.end() + n)
}
pub fn translate(self, n: usize) -> Self {
Self::new(
self.start().checked_add(n).unwrap(),
self.end().checked_add(n).unwrap(),
)
}
pub fn is_empty(self) -> bool {
self.start == self.end
}
pub fn start(self) -> usize {
self.start.try_into().unwrap()
}
pub fn end(self) -> usize {
self.end.try_into().unwrap()
}
pub fn len(self) -> usize {
self.end() - self.start()
}
pub fn of(self, s: &str) -> &str {
&s[self.start()..self.end()]
}
pub fn skip_chars(self, n: usize, s: &str) -> Self {
let n_bytes: usize = self.of(s).chars().take(n).map(char::len_utf8).sum();
Self::new(self.start() + n_bytes, self.end())
}
pub fn trim_start_matches<P: FnMut(char) -> bool>(self, s: &str, pat: P) -> Self {
Self::from_slice(s, self.of(s).trim_start_matches(pat))
}
pub fn trim_start(self, s: &str) -> Self {
Self::from_slice(s, self.of(s).trim_start())
}
pub fn trim_end(self, s: &str) -> Self {
Self::from_slice(s, self.of(s).trim_end())
}
pub fn trim(self, s: &str) -> Self {
Self::from_slice(s, self.of(s).trim_start().trim_end())
}
fn from_slice(s: &str, slice: &str) -> Self {
Self::by_len(slice.as_ptr() as usize - s.as_ptr() as usize, slice.len())
}
}
#[cfg(test)]
mod test {
use super::Span;
#[test]
fn from_slice() {
let src = "0123456789";
assert_eq!(Span::from_slice(src, &src[0..0]), Span::new(0, 0));
assert_eq!(Span::from_slice(src, &src[0..5]), Span::new(0, 5));
assert_eq!(Span::from_slice(src, &src[5..5]), Span::new(5, 5));
assert_eq!(Span::from_slice(src, &src[5..8]), Span::new(5, 8));
assert_eq!(Span::from_slice(src, &src[5..10]), Span::new(5, 10));
assert_eq!(Span::from_slice(src, &src[5..]), Span::new(5, 10));
}
#[test]
fn trim() {
let src = " 23456 ";
assert_eq!(Span::by_len(0, src.len()).trim_start(src), Span::new(2, 10));
assert_eq!(Span::by_len(0, src.len()).trim_end(src), Span::new(0, 7));
assert_eq!(Span::by_len(0, src.len()).trim(src), Span::new(2, 7));
assert_eq!(
Span::by_len(0, src.len()).trim_start(src).trim_end(src),
Span::new(2, 7)
);
}
}

View file

@ -3,8 +3,6 @@
f4f22fc:attribute key class order f4f22fc:attribute key class order
ae6fc15:bugged left/right quote ae6fc15:bugged left/right quote
168469a:bugged left/right quote 168469a:bugged left/right quote
2056174:unicode whitespace emph
2e8fffa:unicode whitespace strong
e1f5b5e:untrimmed whitespace before linebreak e1f5b5e:untrimmed whitespace before linebreak
07888f3:div close within raw block 07888f3:div close within raw block
8423412:heading id conflict with existing id 8423412:heading id conflict with existing id