From 55234bf1936ab5d3a519d6ad508a0b7747ce4411 Mon Sep 17 00:00:00 2001 From: Noah Hellman Date: Sat, 13 May 2023 12:13:33 +0200 Subject: [PATCH] block: consider only ascii whitespace --- src/block.rs | 140 +++++++++++++++++++++++++++------------------------ src/span.rs | 19 ++++--- 2 files changed, 86 insertions(+), 73 deletions(-) diff --git a/src/block.rs b/src/block.rs index 9bba714..077a6c9 100644 --- a/src/block.rs +++ b/src/block.rs @@ -356,10 +356,10 @@ impl<'s> TreeParser<'s> { for line in lines.iter_mut() { let indent_line = line .of(self.src) - .chars() - .take_while(|c| *c != '\n' && c.is_whitespace()) + .bytes() + .take_while(|c| *c != b'\n' && c.is_ascii_whitespace()) .count(); - *line = line.skip_chars((*indent).min(indent_line), self.src); + *line = line.skip((*indent).min(indent_line)); } } else { // trim starting whitespace of each inline @@ -425,7 +425,7 @@ impl<'s> TreeParser<'s> { // trim '#' characters for line in lines.iter_mut().skip(1) { - *line = line.trim_start_matches(self.src, |c| c == '#' || c.is_whitespace()); + *line = line.trim_start_matches(self.src, |c| c == '#' || c.is_ascii_whitespace()); } } @@ -449,28 +449,26 @@ impl<'s> TreeParser<'s> { // update spans, remove indentation / container prefix lines.iter_mut().skip(1).for_each(|sp| { let src = sp.of(self.src); - let src_t = src.trim(); - let spaces = src.chars().take_while(|c| c.is_whitespace()).count(); + let src_t = src.trim_matches(|c: char| c.is_ascii_whitespace()); + let whitespace = src_t.as_ptr() as usize - src.as_ptr() as usize; let skip = match k { Kind::Blockquote => { if src_t == ">" { - spaces + 1 + whitespace + 1 } else if src_t.starts_with('>') - && src_t.chars().nth(1).map_or(false, char::is_whitespace) + && src_t[1..].starts_with(|c: char| c.is_ascii_whitespace()) { - spaces + 1 + usize::from(src_t.len() > 1) + whitespace + 1 + usize::from(src_t.len() > 1) } else { 0 } } - Kind::ListItem { .. } | Kind::Definition { .. 
} => { - spaces.min(outer.of(self.src).chars().count()) - } - Kind::Fenced { indent, .. } => spaces.min(*indent), + Kind::ListItem { .. } | Kind::Definition { .. } => whitespace.min(outer.len()), + Kind::Fenced { indent, .. } => whitespace.min(*indent), _ => panic!("non-container {:?}", k), }; - let count = sp.of(self.src).chars().take_while(|c| *c != '\n').count(); - *sp = sp.skip_chars(skip.min(count), self.src); + let len = sp.of(self.src).bytes().take_while(|c| *c != b'\n').count(); + *sp = sp.skip(skip.min(len)); }); if let Kind::ListItem { ty, .. } = k { @@ -578,12 +576,14 @@ impl<'s> TreeParser<'s> { let caption_line = lines .iter() - .position(|sp| sp.of(self.src).trim_start().starts_with('^')) + .position(|sp| { + sp.of(self.src) + .trim_start_matches(|c: char| c.is_ascii_whitespace()) + .starts_with('^') + }) .map_or(lines.len(), |caption_line| { self.enter(Node::Leaf(Caption), span_start); - lines[caption_line] = lines[caption_line] - .trim_start(self.src) - .skip_chars(2, self.src); + lines[caption_line] = lines[caption_line].trim_start(self.src).skip(2); lines[lines.len() - 1] = lines[lines.len() - 1].trim_end(self.src); for line in &lines[caption_line..] 
{ self.inline(*line); @@ -624,7 +624,7 @@ impl<'s> TreeParser<'s> { l => { matches!(cell.as_bytes()[0], b'-' | b':') && matches!(cell.as_bytes()[l - 1], b'-' | b':') - && cell.chars().skip(1).take(l - 2).all(|c| c == '-') + && cell.bytes().skip(1).take(l - 2).all(|c| c == b'-') } }; separator_row &= separator_cell; @@ -799,48 +799,46 @@ struct IdentifiedBlock<'s> { impl<'s> IdentifiedBlock<'s> { fn new(line: &'s str) -> Self { - let mut chars = line.chars(); - let indent = chars - .clone() - .take_while(|c| *c != '\n' && c.is_whitespace()) - .count(); - (&mut chars).take(indent).last(); - let indent_bytes = line.len() - chars.as_str().len(); - let line = chars.as_str(); - let line_t = line.trim_end(); + let l = line.len(); + + let line = line.trim_start_matches(|c: char| c.is_ascii_whitespace() && c != '\n'); + let indent = l - line.len(); + let line_t = line.trim_end_matches(|c: char| c.is_ascii_whitespace()); + let l = line.len(); let lt = line_t.len(); + let mut chars = line.chars(); let first = if let Some(c) = chars.next() { c } else { return Self { kind: Kind::Atom(Blankline), - span: Span::empty_at(indent_bytes), + span: Span::empty_at(indent), }; }; match first { - '\n' => Some((Kind::Atom(Blankline), Span::by_len(indent_bytes, 1))), + '\n' => Some((Kind::Atom(Blankline), Span::by_len(indent, 1))), '#' => chars .find(|c| *c != '#') - .map_or(true, char::is_whitespace) + .map_or(true, |c| c.is_ascii_whitespace()) .then(|| { - let level = line.chars().take_while(|c| *c == '#').count(); - (Kind::Heading { level }, Span::by_len(indent_bytes, level)) + let level = line.bytes().take_while(|c| *c == b'#').count(); + (Kind::Heading { level }, Span::by_len(indent, level)) }), '>' => { - if chars.next().map_or(true, char::is_whitespace) { - Some((Kind::Blockquote, Span::by_len(indent_bytes, 1))) + if chars.next().map_or(true, |c| c.is_ascii_whitespace()) { + Some((Kind::Blockquote, Span::by_len(indent, 1))) } else { None } } '{' => (attr::valid(line.chars()) == lt) - 
.then(|| (Kind::Atom(Attributes), Span::by_len(indent_bytes, l))), + .then(|| (Kind::Atom(Attributes), Span::by_len(indent, l))), '|' => { if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") { - Some((Kind::Table { caption: false }, Span::empty_at(indent_bytes))) + Some((Kind::Table { caption: false }, Span::empty_at(indent))) } else { None } @@ -854,17 +852,17 @@ impl<'s> IdentifiedBlock<'s> { footnote, label: &label[usize::from(footnote)..], }, - Span::by_len(0, indent_bytes + 3 + l), + Span::by_len(0, indent + 3 + l), ) }), '-' | '*' if Self::is_thematic_break(chars.clone()) => { - Some((Kind::Atom(ThematicBreak), Span::by_len(indent_bytes, lt))) + Some((Kind::Atom(ThematicBreak), Span::by_len(indent, lt))) } b @ ('-' | '*' | '+') => chars.next().map_or(true, |c| c == ' ').then(|| { let task_list = chars.next() == Some('[') && matches!(chars.next(), Some('x' | 'X' | ' ')) && chars.next() == Some(']') - && chars.next().map_or(true, char::is_whitespace); + && chars.next().map_or(true, |c| c.is_ascii_whitespace()); if task_list { ( Kind::ListItem { @@ -872,7 +870,7 @@ impl<'s> IdentifiedBlock<'s> { ty: Task, last_blankline: false, }, - Span::by_len(indent_bytes, 5), + Span::by_len(indent, 5), ) } else { ( @@ -881,25 +879,33 @@ impl<'s> IdentifiedBlock<'s> { ty: Unordered(b as u8), last_blankline: false, }, - Span::by_len(indent_bytes, 1), + Span::by_len(indent, 1), ) } }), - ':' if chars.clone().next().map_or(true, char::is_whitespace) => Some(( - Kind::ListItem { - indent, - ty: Description, - last_blankline: false, - }, - Span::by_len(indent_bytes, 1), - )), + ':' if chars + .clone() + .next() + .map_or(true, |c| c.is_ascii_whitespace()) => + { + Some(( + Kind::ListItem { + indent, + ty: Description, + last_blankline: false, + }, + Span::by_len(indent, 1), + )) + } f @ ('`' | ':' | '~') => { let fence_length = 1 + (&mut chars).take_while(|c| *c == f).count(); - let spec = &line_t[fence_length..].trim_start(); + let spec = + 
&line_t[fence_length..].trim_start_matches(|c: char| c.is_ascii_whitespace()); let valid_spec = if f == ':' { spec.chars().all(attr::is_name) } else { - !spec.chars().any(char::is_whitespace) && !spec.chars().any(|c| c == '`') + !spec.chars().any(|c| c.is_ascii_whitespace()) + && !spec.chars().any(|c| c == '`') }; (valid_spec && fence_length >= 3).then(|| { ( @@ -913,7 +919,7 @@ impl<'s> IdentifiedBlock<'s> { spec, has_closing_fence: false, }, - Span::by_len(indent_bytes, line.len()), + Span::by_len(indent, line.len()), ) }) } @@ -924,14 +930,14 @@ impl<'s> IdentifiedBlock<'s> { ty: Ordered(num, style), last_blankline: false, }, - Span::by_len(indent_bytes, len), + Span::by_len(indent, len), ) }), } .map(|(kind, span)| Self { kind, span }) .unwrap_or(Self { kind: Kind::Paragraph, - span: Span::empty_at(indent_bytes), + span: Span::empty_at(indent), }) } @@ -940,7 +946,7 @@ impl<'s> IdentifiedBlock<'s> { for c in chars { if matches!(c, '-' | '*') { n += 1; - } else if !c.is_whitespace() { + } else if !c.is_ascii_whitespace() { return false; } } @@ -1023,7 +1029,7 @@ impl<'s> IdentifiedBlock<'s> { numbering }; - if chars.next().map_or(true, char::is_whitespace) { + if chars.next().map_or(true, |c| c.is_ascii_whitespace()) { Some((numbering, style, len_num + len_style)) } else { None @@ -1054,18 +1060,19 @@ impl<'s> Kind<'s> { last_blankline, .. } => { - let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); + let line_t = line.trim_start_matches(|c: char| c.is_ascii_whitespace()); + let whitespace = line.len() - line_t.len(); let para = !*last_blankline && matches!(next, Self::Paragraph); - let blankline = matches!(next, Self::Atom(Blankline)); - *last_blankline = blankline; - blankline || spaces > *indent || para + *last_blankline = matches!(next, Self::Atom(Blankline)); + *last_blankline || whitespace > *indent || para } Self::Definition { indent, footnote, .. 
} => { if *footnote { - let spaces = line.chars().take_while(|c| c.is_whitespace()).count(); - matches!(next, Self::Atom(Blankline)) || spaces > *indent + let line_t = line.trim_start_matches(|c: char| c.is_ascii_whitespace()); + let whitespace = line.len() - line_t.len(); + matches!(next, Self::Atom(Blankline)) || whitespace > *indent } else { line.starts_with(' ') && !matches!(next, Self::Atom(Blankline)) } @@ -1093,7 +1100,10 @@ impl<'s> Kind<'s> { } Self::Table { caption } => { matches!(next, Self::Table { .. } | Self::Atom(Blankline)) || { - if line.trim().starts_with("^ ") { + if line + .trim_matches(|c: char| c.is_ascii_whitespace()) + .starts_with("^ ") + { *caption = true; true } else { diff --git a/src/span.rs b/src/span.rs index 722281e..73ad3a7 100644 --- a/src/span.rs +++ b/src/span.rs @@ -85,25 +85,28 @@ impl Span { &s[self.start()..self.end()] } - pub fn skip_chars(self, n: usize, s: &str) -> Self { - let n_bytes: usize = self.of(s).chars().take(n).map(char::len_utf8).sum(); - Self::new(self.start() + n_bytes, self.end()) - } - pub fn trim_start_matches<P: FnMut(char) -> bool>(self, s: &str, pat: P) -> Self { Self::from_slice(s, self.of(s).trim_start_matches(pat)) } pub fn trim_start(self, s: &str) -> Self { - Self::from_slice(s, self.of(s).trim_start()) + Self::from_slice( + s, + self.of(s) + .trim_start_matches(|c: char| c.is_ascii_whitespace()), + ) } pub fn trim_end(self, s: &str) -> Self { - Self::from_slice(s, self.of(s).trim_end()) + Self::from_slice( + s, + self.of(s) + .trim_end_matches(|c: char| c.is_ascii_whitespace()), + ) } pub fn trim(self, s: &str) -> Self { - Self::from_slice(s, self.of(s).trim_start().trim_end()) + self.trim_start(s).trim_end(s) } fn from_slice(s: &str, slice: &str) -> Self {