block: consider only ascii whitespace

This commit is contained in:
Noah Hellman 2023-05-13 12:13:33 +02:00
parent d43d6c908f
commit 55234bf193
2 changed files with 86 additions and 73 deletions

View file

@@ -356,10 +356,10 @@ impl<'s> TreeParser<'s> {
for line in lines.iter_mut() { for line in lines.iter_mut() {
let indent_line = line let indent_line = line
.of(self.src) .of(self.src)
.chars() .bytes()
.take_while(|c| *c != '\n' && c.is_whitespace()) .take_while(|c| *c != b'\n' && c.is_ascii_whitespace())
.count(); .count();
*line = line.skip_chars((*indent).min(indent_line), self.src); *line = line.skip((*indent).min(indent_line));
} }
} else { } else {
// trim starting whitespace of each inline // trim starting whitespace of each inline
@@ -425,7 +425,7 @@ impl<'s> TreeParser<'s> {
// trim '#' characters // trim '#' characters
for line in lines.iter_mut().skip(1) { for line in lines.iter_mut().skip(1) {
*line = line.trim_start_matches(self.src, |c| c == '#' || c.is_whitespace()); *line = line.trim_start_matches(self.src, |c| c == '#' || c.is_ascii_whitespace());
} }
} }
@@ -449,28 +449,26 @@ impl<'s> TreeParser<'s> {
// update spans, remove indentation / container prefix // update spans, remove indentation / container prefix
lines.iter_mut().skip(1).for_each(|sp| { lines.iter_mut().skip(1).for_each(|sp| {
let src = sp.of(self.src); let src = sp.of(self.src);
let src_t = src.trim(); let src_t = src.trim_matches(|c: char| c.is_ascii_whitespace());
let spaces = src.chars().take_while(|c| c.is_whitespace()).count(); let whitespace = src_t.as_ptr() as usize - src.as_ptr() as usize;
let skip = match k { let skip = match k {
Kind::Blockquote => { Kind::Blockquote => {
if src_t == ">" { if src_t == ">" {
spaces + 1 whitespace + 1
} else if src_t.starts_with('>') } else if src_t.starts_with('>')
&& src_t.chars().nth(1).map_or(false, char::is_whitespace) && src_t[1..].starts_with(|c: char| c.is_ascii_whitespace())
{ {
spaces + 1 + usize::from(src_t.len() > 1) whitespace + 1 + usize::from(src_t.len() > 1)
} else { } else {
0 0
} }
} }
Kind::ListItem { .. } | Kind::Definition { .. } => { Kind::ListItem { .. } | Kind::Definition { .. } => whitespace.min(outer.len()),
spaces.min(outer.of(self.src).chars().count()) Kind::Fenced { indent, .. } => whitespace.min(*indent),
}
Kind::Fenced { indent, .. } => spaces.min(*indent),
_ => panic!("non-container {:?}", k), _ => panic!("non-container {:?}", k),
}; };
let count = sp.of(self.src).chars().take_while(|c| *c != '\n').count(); let len = sp.of(self.src).bytes().take_while(|c| *c != b'\n').count();
*sp = sp.skip_chars(skip.min(count), self.src); *sp = sp.skip(skip.min(len));
}); });
if let Kind::ListItem { ty, .. } = k { if let Kind::ListItem { ty, .. } = k {
@@ -578,12 +576,14 @@ impl<'s> TreeParser<'s> {
let caption_line = lines let caption_line = lines
.iter() .iter()
.position(|sp| sp.of(self.src).trim_start().starts_with('^')) .position(|sp| {
sp.of(self.src)
.trim_start_matches(|c: char| c.is_ascii_whitespace())
.starts_with('^')
})
.map_or(lines.len(), |caption_line| { .map_or(lines.len(), |caption_line| {
self.enter(Node::Leaf(Caption), span_start); self.enter(Node::Leaf(Caption), span_start);
lines[caption_line] = lines[caption_line] lines[caption_line] = lines[caption_line].trim_start(self.src).skip(2);
.trim_start(self.src)
.skip_chars(2, self.src);
lines[lines.len() - 1] = lines[lines.len() - 1].trim_end(self.src); lines[lines.len() - 1] = lines[lines.len() - 1].trim_end(self.src);
for line in &lines[caption_line..] { for line in &lines[caption_line..] {
self.inline(*line); self.inline(*line);
@@ -624,7 +624,7 @@ impl<'s> TreeParser<'s> {
l => { l => {
matches!(cell.as_bytes()[0], b'-' | b':') matches!(cell.as_bytes()[0], b'-' | b':')
&& matches!(cell.as_bytes()[l - 1], b'-' | b':') && matches!(cell.as_bytes()[l - 1], b'-' | b':')
&& cell.chars().skip(1).take(l - 2).all(|c| c == '-') && cell.bytes().skip(1).take(l - 2).all(|c| c == b'-')
} }
}; };
separator_row &= separator_cell; separator_row &= separator_cell;
@@ -799,48 +799,46 @@ struct IdentifiedBlock<'s> {
impl<'s> IdentifiedBlock<'s> { impl<'s> IdentifiedBlock<'s> {
fn new(line: &'s str) -> Self { fn new(line: &'s str) -> Self {
let mut chars = line.chars(); let l = line.len();
let indent = chars
.clone() let line = line.trim_start_matches(|c: char| c.is_ascii_whitespace() && c != '\n');
.take_while(|c| *c != '\n' && c.is_whitespace()) let indent = l - line.len();
.count(); let line_t = line.trim_end_matches(|c: char| c.is_ascii_whitespace());
(&mut chars).take(indent).last();
let indent_bytes = line.len() - chars.as_str().len();
let line = chars.as_str();
let line_t = line.trim_end();
let l = line.len(); let l = line.len();
let lt = line_t.len(); let lt = line_t.len();
let mut chars = line.chars();
let first = if let Some(c) = chars.next() { let first = if let Some(c) = chars.next() {
c c
} else { } else {
return Self { return Self {
kind: Kind::Atom(Blankline), kind: Kind::Atom(Blankline),
span: Span::empty_at(indent_bytes), span: Span::empty_at(indent),
}; };
}; };
match first { match first {
'\n' => Some((Kind::Atom(Blankline), Span::by_len(indent_bytes, 1))), '\n' => Some((Kind::Atom(Blankline), Span::by_len(indent, 1))),
'#' => chars '#' => chars
.find(|c| *c != '#') .find(|c| *c != '#')
.map_or(true, char::is_whitespace) .map_or(true, |c| c.is_ascii_whitespace())
.then(|| { .then(|| {
let level = line.chars().take_while(|c| *c == '#').count(); let level = line.bytes().take_while(|c| *c == b'#').count();
(Kind::Heading { level }, Span::by_len(indent_bytes, level)) (Kind::Heading { level }, Span::by_len(indent, level))
}), }),
'>' => { '>' => {
if chars.next().map_or(true, char::is_whitespace) { if chars.next().map_or(true, |c| c.is_ascii_whitespace()) {
Some((Kind::Blockquote, Span::by_len(indent_bytes, 1))) Some((Kind::Blockquote, Span::by_len(indent, 1)))
} else { } else {
None None
} }
} }
'{' => (attr::valid(line.chars()) == lt) '{' => (attr::valid(line.chars()) == lt)
.then(|| (Kind::Atom(Attributes), Span::by_len(indent_bytes, l))), .then(|| (Kind::Atom(Attributes), Span::by_len(indent, l))),
'|' => { '|' => {
if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") { if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") {
Some((Kind::Table { caption: false }, Span::empty_at(indent_bytes))) Some((Kind::Table { caption: false }, Span::empty_at(indent)))
} else { } else {
None None
} }
@@ -854,17 +852,17 @@ impl<'s> IdentifiedBlock<'s> {
footnote, footnote,
label: &label[usize::from(footnote)..], label: &label[usize::from(footnote)..],
}, },
Span::by_len(0, indent_bytes + 3 + l), Span::by_len(0, indent + 3 + l),
) )
}), }),
'-' | '*' if Self::is_thematic_break(chars.clone()) => { '-' | '*' if Self::is_thematic_break(chars.clone()) => {
Some((Kind::Atom(ThematicBreak), Span::by_len(indent_bytes, lt))) Some((Kind::Atom(ThematicBreak), Span::by_len(indent, lt)))
} }
b @ ('-' | '*' | '+') => chars.next().map_or(true, |c| c == ' ').then(|| { b @ ('-' | '*' | '+') => chars.next().map_or(true, |c| c == ' ').then(|| {
let task_list = chars.next() == Some('[') let task_list = chars.next() == Some('[')
&& matches!(chars.next(), Some('x' | 'X' | ' ')) && matches!(chars.next(), Some('x' | 'X' | ' '))
&& chars.next() == Some(']') && chars.next() == Some(']')
&& chars.next().map_or(true, char::is_whitespace); && chars.next().map_or(true, |c| c.is_ascii_whitespace());
if task_list { if task_list {
( (
Kind::ListItem { Kind::ListItem {
@@ -872,7 +870,7 @@ impl<'s> IdentifiedBlock<'s> {
ty: Task, ty: Task,
last_blankline: false, last_blankline: false,
}, },
Span::by_len(indent_bytes, 5), Span::by_len(indent, 5),
) )
} else { } else {
( (
@@ -881,25 +879,33 @@ impl<'s> IdentifiedBlock<'s> {
ty: Unordered(b as u8), ty: Unordered(b as u8),
last_blankline: false, last_blankline: false,
}, },
Span::by_len(indent_bytes, 1), Span::by_len(indent, 1),
) )
} }
}), }),
':' if chars.clone().next().map_or(true, char::is_whitespace) => Some(( ':' if chars
Kind::ListItem { .clone()
indent, .next()
ty: Description, .map_or(true, |c| c.is_ascii_whitespace()) =>
last_blankline: false, {
}, Some((
Span::by_len(indent_bytes, 1), Kind::ListItem {
)), indent,
ty: Description,
last_blankline: false,
},
Span::by_len(indent, 1),
))
}
f @ ('`' | ':' | '~') => { f @ ('`' | ':' | '~') => {
let fence_length = 1 + (&mut chars).take_while(|c| *c == f).count(); let fence_length = 1 + (&mut chars).take_while(|c| *c == f).count();
let spec = &line_t[fence_length..].trim_start(); let spec =
&line_t[fence_length..].trim_start_matches(|c: char| c.is_ascii_whitespace());
let valid_spec = if f == ':' { let valid_spec = if f == ':' {
spec.chars().all(attr::is_name) spec.chars().all(attr::is_name)
} else { } else {
!spec.chars().any(char::is_whitespace) && !spec.chars().any(|c| c == '`') !spec.chars().any(|c| c.is_ascii_whitespace())
&& !spec.chars().any(|c| c == '`')
}; };
(valid_spec && fence_length >= 3).then(|| { (valid_spec && fence_length >= 3).then(|| {
( (
@@ -913,7 +919,7 @@ impl<'s> IdentifiedBlock<'s> {
spec, spec,
has_closing_fence: false, has_closing_fence: false,
}, },
Span::by_len(indent_bytes, line.len()), Span::by_len(indent, line.len()),
) )
}) })
} }
@@ -924,14 +930,14 @@ impl<'s> IdentifiedBlock<'s> {
ty: Ordered(num, style), ty: Ordered(num, style),
last_blankline: false, last_blankline: false,
}, },
Span::by_len(indent_bytes, len), Span::by_len(indent, len),
) )
}), }),
} }
.map(|(kind, span)| Self { kind, span }) .map(|(kind, span)| Self { kind, span })
.unwrap_or(Self { .unwrap_or(Self {
kind: Kind::Paragraph, kind: Kind::Paragraph,
span: Span::empty_at(indent_bytes), span: Span::empty_at(indent),
}) })
} }
@@ -940,7 +946,7 @@ impl<'s> IdentifiedBlock<'s> {
for c in chars { for c in chars {
if matches!(c, '-' | '*') { if matches!(c, '-' | '*') {
n += 1; n += 1;
} else if !c.is_whitespace() { } else if !c.is_ascii_whitespace() {
return false; return false;
} }
} }
@@ -1023,7 +1029,7 @@ impl<'s> IdentifiedBlock<'s> {
numbering numbering
}; };
if chars.next().map_or(true, char::is_whitespace) { if chars.next().map_or(true, |c| c.is_ascii_whitespace()) {
Some((numbering, style, len_num + len_style)) Some((numbering, style, len_num + len_style))
} else { } else {
None None
@@ -1054,18 +1060,19 @@ impl<'s> Kind<'s> {
last_blankline, last_blankline,
.. ..
} => { } => {
let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); let line_t = line.trim_start_matches(|c: char| c.is_ascii_whitespace());
let whitespace = line.len() - line_t.len();
let para = !*last_blankline && matches!(next, Self::Paragraph); let para = !*last_blankline && matches!(next, Self::Paragraph);
let blankline = matches!(next, Self::Atom(Blankline)); *last_blankline = matches!(next, Self::Atom(Blankline));
*last_blankline = blankline; *last_blankline || whitespace > *indent || para
blankline || spaces > *indent || para
} }
Self::Definition { Self::Definition {
indent, footnote, .. indent, footnote, ..
} => { } => {
if *footnote { if *footnote {
let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); let line_t = line.trim_start_matches(|c: char| c.is_ascii_whitespace());
matches!(next, Self::Atom(Blankline)) || spaces > *indent let whitespace = line.len() - line_t.len();
matches!(next, Self::Atom(Blankline)) || whitespace > *indent
} else { } else {
line.starts_with(' ') && !matches!(next, Self::Atom(Blankline)) line.starts_with(' ') && !matches!(next, Self::Atom(Blankline))
} }
@@ -1093,7 +1100,10 @@ impl<'s> Kind<'s> {
} }
Self::Table { caption } => { Self::Table { caption } => {
matches!(next, Self::Table { .. } | Self::Atom(Blankline)) || { matches!(next, Self::Table { .. } | Self::Atom(Blankline)) || {
if line.trim().starts_with("^ ") { if line
.trim_matches(|c: char| c.is_ascii_whitespace())
.starts_with("^ ")
{
*caption = true; *caption = true;
true true
} else { } else {

View file

@@ -85,25 +85,28 @@ impl Span {
&s[self.start()..self.end()] &s[self.start()..self.end()]
} }
pub fn skip_chars(self, n: usize, s: &str) -> Self {
let n_bytes: usize = self.of(s).chars().take(n).map(char::len_utf8).sum();
Self::new(self.start() + n_bytes, self.end())
}
pub fn trim_start_matches<P: FnMut(char) -> bool>(self, s: &str, pat: P) -> Self { pub fn trim_start_matches<P: FnMut(char) -> bool>(self, s: &str, pat: P) -> Self {
Self::from_slice(s, self.of(s).trim_start_matches(pat)) Self::from_slice(s, self.of(s).trim_start_matches(pat))
} }
pub fn trim_start(self, s: &str) -> Self { pub fn trim_start(self, s: &str) -> Self {
Self::from_slice(s, self.of(s).trim_start()) Self::from_slice(
s,
self.of(s)
.trim_start_matches(|c: char| c.is_ascii_whitespace()),
)
} }
pub fn trim_end(self, s: &str) -> Self { pub fn trim_end(self, s: &str) -> Self {
Self::from_slice(s, self.of(s).trim_end()) Self::from_slice(
s,
self.of(s)
.trim_end_matches(|c: char| c.is_ascii_whitespace()),
)
} }
pub fn trim(self, s: &str) -> Self { pub fn trim(self, s: &str) -> Self {
Self::from_slice(s, self.of(s).trim_start().trim_end()) self.trim_start(s).trim_end(s)
} }
fn from_slice(s: &str, slice: &str) -> Self { fn from_slice(s: &str, slice: &str) -> Self {