attr: use bytes instead of chars

Only consider ASCII whitespace (Unicode whitespace is no longer accepted between attributes).
This commit is contained in:
Noah Hellman 2023-05-13 12:35:17 +02:00
parent 798f8941d8
commit 72a3378831
2 changed files with 43 additions and 49 deletions

View file

@ -8,13 +8,13 @@ pub(crate) fn parse(src: &str) -> Attributes {
a
}
pub fn valid<I: Iterator<Item = char>>(chars: I) -> usize {
pub fn valid(src: &str) -> usize {
use State::*;
let mut n = 0;
let mut state = Start;
for c in chars {
n += c.len_utf8();
for c in src.bytes() {
n += 1;
state = state.step(c);
match state {
Done | Invalid => break,
@ -256,11 +256,11 @@ impl Validator {
/// Returns number of valid bytes parsed (0 means invalid) if finished, otherwise more input is
/// needed.
pub fn parse(&mut self, input: &str) -> Option<usize> {
let mut chars = input.chars();
for c in &mut chars {
let mut bytes = input.bytes();
for c in &mut bytes {
self.state = self.state.step(c);
match self.state {
State::Done => return Some(input.len() - chars.as_str().len()),
State::Done => return Some(input.len() - bytes.len()),
State::Invalid => return Some(0),
_ => {}
}
@ -297,7 +297,7 @@ impl<'s> Parser<'s> {
let mut pos = 0;
let mut pos_prev = 0;
for c in input.chars() {
for c in input.bytes() {
let state_next = self.state.step(c);
let st = std::mem::replace(&mut self.state, state_next);
@ -318,7 +318,7 @@ impl<'s> Parser<'s> {
}
};
pos += c.len_utf8();
pos += 1;
debug_assert!(!matches!(self.state, Invalid));
@ -358,40 +358,40 @@ enum State {
}
impl State {
fn step(self, c: char) -> State {
fn step(self, c: u8) -> State {
use State::*;
match self {
Start if c == '{' => Whitespace,
Start if c == b'{' => Whitespace,
Start => Invalid,
Whitespace => match c {
'}' => Done,
'.' => ClassFirst,
'#' => IdentifierFirst,
'%' => Comment,
b'}' => Done,
b'.' => ClassFirst,
b'#' => IdentifierFirst,
b'%' => Comment,
c if is_name(c) => Key,
c if c.is_whitespace() => Whitespace,
c if c.is_ascii_whitespace() => Whitespace,
_ => Invalid,
},
Comment if c == '%' => Whitespace,
Comment if c == b'%' => Whitespace,
Comment => Comment,
ClassFirst if is_name(c) => Class,
ClassFirst => Invalid,
IdentifierFirst if is_name(c) => Identifier,
IdentifierFirst => Invalid,
s @ (Class | Identifier | Value) if is_name(c) => s,
Class | Identifier | Value if c.is_whitespace() => Whitespace,
Class | Identifier | Value if c == '}' => Done,
Class | Identifier | Value if c.is_ascii_whitespace() => Whitespace,
Class | Identifier | Value if c == b'}' => Done,
Class | Identifier | Value => Invalid,
Key if is_name(c) => Key,
Key if c == '=' => ValueFirst,
Key if c == b'=' => ValueFirst,
Key => Invalid,
ValueFirst if is_name(c) => Value,
ValueFirst if c == '"' => ValueQuoted,
ValueFirst if c == b'"' => ValueQuoted,
ValueFirst => Invalid,
ValueQuoted | ValueNewline | ValueContinued if c == '"' => Whitespace,
ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == '\n' => ValueNewline,
ValueQuoted if c == '\\' => ValueEscape,
ValueQuoted | ValueNewline | ValueContinued if c == b'"' => Whitespace,
ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == b'\n' => ValueNewline,
ValueQuoted if c == b'\\' => ValueEscape,
ValueQuoted | ValueEscape => ValueQuoted,
ValueNewline | ValueContinued => ValueContinued,
Invalid | Done => panic!("{:?}", self),
@ -399,8 +399,8 @@ impl State {
}
}
pub fn is_name(c: char) -> bool {
c.is_ascii_alphanumeric() || matches!(c, ':' | '_' | '-')
pub fn is_name(c: u8) -> bool {
c.is_ascii_alphanumeric() || matches!(c, b':' | b'_' | b'-')
}
#[cfg(test)]
@ -435,11 +435,6 @@ mod test {
test_attr!("{#a #b}", ("id", "b"));
}
#[test]
fn unicode_whitespace() {
test_attr!("{.a .b}", ("class", "a b"));
}
#[test]
fn value_unquoted() {
test_attr!(
@ -517,47 +512,45 @@ mod test {
#[test]
fn valid_full() {
let src = "{.class %comment%}";
assert_eq!(super::valid(src.chars()), src.len());
assert_eq!(super::valid(src), src.len());
}
#[test]
fn valid_unicode() {
let src = r#"{a="б"}"#;
assert_eq!(super::valid(src.chars()), src.len());
assert_eq!(super::valid(src), src.len());
}
#[test]
fn valid_empty() {
let src = "{}";
assert_eq!(super::valid(src.chars()), src.len());
assert_eq!(super::valid(src), src.len());
}
#[test]
fn valid_whitespace() {
let src = "{ \n }";
assert_eq!(super::valid(src.chars()), src.len());
assert_eq!(super::valid(src), src.len());
}
#[test]
fn valid_comment() {
let src = "{%comment%}";
assert_eq!(super::valid(src.chars()), src.len());
assert_eq!(super::valid(src), src.len());
}
#[test]
fn valid_trailing() {
let src = "{.class}";
assert_eq!(
super::valid(src.chars().chain("{.ignore}".chars())),
src.len(),
);
let src = "{.class}{.ignore}";
let src_valid = "{.class}";
assert_eq!(super::valid(src), src_valid.len());
}
#[test]
fn valid_invalid() {
assert_eq!(super::valid(" {.valid}".chars()), 0);
assert_eq!(super::valid("{.class invalid}".chars()), 0);
assert_eq!(super::valid("abc".chars()), 0);
assert_eq!(super::valid("{.abc.}".chars()), 0);
assert_eq!(super::valid(" {.valid}"), 0);
assert_eq!(super::valid("{.class invalid}"), 0);
assert_eq!(super::valid("abc"), 0);
assert_eq!(super::valid("{.abc.}"), 0);
}
}

View file

@ -834,8 +834,9 @@ impl<'s> IdentifiedBlock<'s> {
None
}
}
'{' => (attr::valid(line.chars()) == lt)
.then(|| (Kind::Atom(Attributes), Span::by_len(indent, l))),
'{' => {
(attr::valid(line) == lt).then(|| (Kind::Atom(Attributes), Span::by_len(indent, l)))
}
'|' => {
if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") {
Some((Kind::Table { caption: false }, Span::empty_at(indent)))
@ -902,10 +903,10 @@ impl<'s> IdentifiedBlock<'s> {
let spec =
&line_t[fence_length..].trim_start_matches(|c: char| c.is_ascii_whitespace());
let valid_spec = if f == ':' {
spec.chars().all(attr::is_name)
spec.bytes().all(attr::is_name)
} else {
!spec.chars().any(|c| c.is_ascii_whitespace())
&& !spec.chars().any(|c| c == '`')
!spec.bytes().any(|c| c.is_ascii_whitespace())
&& !spec.bytes().any(|c| c == b'`')
};
(valid_spec && fence_length >= 3).then(|| {
(