attr: use bytes instead of chars

only consider ascii whitespace
2023-05-13 12:35:17 +02:00 · 2023-05-13 12:35:17 +02:00 · 72a3378831
commit 72a3378831
parent 798f8941d8
2 changed files with 43 additions and 49 deletions
--- a/src/attr.rs
+++ b/src/attr.rs
@@ -8,13 +8,13 @@ pub(crate) fn parse(src: &str) -> Attributes {
    a
 }

-pub fn valid<I: Iterator<Item = char>>(chars: I) -> usize {
+pub fn valid(src: &str) -> usize {
    use State::*;

    let mut n = 0;
    let mut state = Start;
-    for c in chars {
-        n += c.len_utf8();
+    for c in src.bytes() {
+        n += 1;
        state = state.step(c);
        match state {
            Done | Invalid => break,
@@ -256,11 +256,11 @@ impl Validator {
    /// Returns number of valid bytes parsed (0 means invalid) if finished, otherwise more input is
    /// needed.
    pub fn parse(&mut self, input: &str) -> Option<usize> {
-        let mut chars = input.chars();
-        for c in &mut chars {
+        let mut bytes = input.bytes();
+        for c in &mut bytes {
            self.state = self.state.step(c);
            match self.state {
-                State::Done => return Some(input.len() - chars.as_str().len()),
+                State::Done => return Some(input.len() - bytes.len()),
                State::Invalid => return Some(0),
                _ => {}
            }
@@ -297,7 +297,7 @@ impl<'s> Parser<'s> {
        let mut pos = 0;
        let mut pos_prev = 0;

-        for c in input.chars() {
+        for c in input.bytes() {
            let state_next = self.state.step(c);
            let st = std::mem::replace(&mut self.state, state_next);

@@ -318,7 +318,7 @@ impl<'s> Parser<'s> {
                }
            };

-            pos += c.len_utf8();
+            pos += 1;

            debug_assert!(!matches!(self.state, Invalid));

@@ -358,40 +358,40 @@ enum State {
 }

 impl State {
-    fn step(self, c: char) -> State {
+    fn step(self, c: u8) -> State {
        use State::*;

        match self {
-            Start if c == '{' => Whitespace,
+            Start if c == b'{' => Whitespace,
            Start => Invalid,
            Whitespace => match c {
-                '}' => Done,
-                '.' => ClassFirst,
-                '#' => IdentifierFirst,
-                '%' => Comment,
+                b'}' => Done,
+                b'.' => ClassFirst,
+                b'#' => IdentifierFirst,
+                b'%' => Comment,
                c if is_name(c) => Key,
-                c if c.is_whitespace() => Whitespace,
+                c if c.is_ascii_whitespace() => Whitespace,
                _ => Invalid,
            },
-            Comment if c == '%' => Whitespace,
+            Comment if c == b'%' => Whitespace,
            Comment => Comment,
            ClassFirst if is_name(c) => Class,
            ClassFirst => Invalid,
            IdentifierFirst if is_name(c) => Identifier,
            IdentifierFirst => Invalid,
            s @ (Class | Identifier | Value) if is_name(c) => s,
-            Class | Identifier | Value if c.is_whitespace() => Whitespace,
-            Class | Identifier | Value if c == '}' => Done,
+            Class | Identifier | Value if c.is_ascii_whitespace() => Whitespace,
+            Class | Identifier | Value if c == b'}' => Done,
            Class | Identifier | Value => Invalid,
            Key if is_name(c) => Key,
-            Key if c == '=' => ValueFirst,
+            Key if c == b'=' => ValueFirst,
            Key => Invalid,
            ValueFirst if is_name(c) => Value,
-            ValueFirst if c == '"' => ValueQuoted,
+            ValueFirst if c == b'"' => ValueQuoted,
            ValueFirst => Invalid,
-            ValueQuoted | ValueNewline | ValueContinued if c == '"' => Whitespace,
-            ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == '\n' => ValueNewline,
-            ValueQuoted if c == '\\' => ValueEscape,
+            ValueQuoted | ValueNewline | ValueContinued if c == b'"' => Whitespace,
+            ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == b'\n' => ValueNewline,
+            ValueQuoted if c == b'\\' => ValueEscape,
            ValueQuoted | ValueEscape => ValueQuoted,
            ValueNewline | ValueContinued => ValueContinued,
            Invalid | Done => panic!("{:?}", self),
@@ -399,8 +399,8 @@ impl State {
    }
 }

-pub fn is_name(c: char) -> bool {
-    c.is_ascii_alphanumeric() || matches!(c, ':' | '_' | '-')
+pub fn is_name(c: u8) -> bool {
+    c.is_ascii_alphanumeric() || matches!(c, b':' | b'_' | b'-')
 }

 #[cfg(test)]
@@ -435,11 +435,6 @@ mod test {
        test_attr!("{#a #b}", ("id", "b"));
    }

-    #[test]
-    fn unicode_whitespace() {
-        test_attr!("{.a .b}", ("class", "a b"));
-    }
-
    #[test]
    fn value_unquoted() {
        test_attr!(
@@ -517,47 +512,45 @@ mod test {
    #[test]
    fn valid_full() {
        let src = "{.class %comment%}";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
    }

    #[test]
    fn valid_unicode() {
        let src = r#"{a="б"}"#;
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
    }

    #[test]
    fn valid_empty() {
        let src = "{}";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
    }

    #[test]
    fn valid_whitespace() {
        let src = "{ \n }";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
    }

    #[test]
    fn valid_comment() {
        let src = "{%comment%}";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
    }

    #[test]
    fn valid_trailing() {
-        let src = "{.class}";
-        assert_eq!(
-            super::valid(src.chars().chain("{.ignore}".chars())),
-            src.len(),
-        );
+        let src = "{.class}{.ignore}";
+        let src_valid = "{.class}";
+        assert_eq!(super::valid(src), src_valid.len());
    }

    #[test]
    fn valid_invalid() {
-        assert_eq!(super::valid(" {.valid}".chars()), 0);
-        assert_eq!(super::valid("{.class invalid}".chars()), 0);
-        assert_eq!(super::valid("abc".chars()), 0);
-        assert_eq!(super::valid("{.abc.}".chars()), 0);
+        assert_eq!(super::valid(" {.valid}"), 0);
+        assert_eq!(super::valid("{.class invalid}"), 0);
+        assert_eq!(super::valid("abc"), 0);
+        assert_eq!(super::valid("{.abc.}"), 0);
    }
 }
--- a/src/block.rs
+++ b/src/block.rs
@@ -834,8 +834,9 @@ impl<'s> IdentifiedBlock<'s> {
                    None
                }
            }
-            '{' => (attr::valid(line.chars()) == lt)
-                .then(|| (Kind::Atom(Attributes), Span::by_len(indent, l))),
+            '{' => {
+                (attr::valid(line) == lt).then(|| (Kind::Atom(Attributes), Span::by_len(indent, l)))
+            }
            '|' => {
                if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") {
                    Some((Kind::Table { caption: false }, Span::empty_at(indent)))
@@ -902,10 +903,10 @@ impl<'s> IdentifiedBlock<'s> {
                let spec =
                    &line_t[fence_length..].trim_start_matches(|c: char| c.is_ascii_whitespace());
                let valid_spec = if f == ':' {
-                    spec.chars().all(attr::is_name)
+                    spec.bytes().all(attr::is_name)
                } else {
-                    !spec.chars().any(|c| c.is_ascii_whitespace())
-                        && !spec.chars().any(|c| c == '`')
+                    !spec.bytes().any(|c| c.is_ascii_whitespace())
+                        && !spec.bytes().any(|c| c == b'`')
                };
                (valid_spec && fence_length >= 3).then(|| {
                    (