From 72a3378831ab8ac31c545bfce79527ee94ce6cca Mon Sep 17 00:00:00 2001
From: Noah Hellman <noah@hllmn.net>
Date: Sat, 13 May 2023 12:35:17 +0200
Subject: [PATCH] attr: use bytes instead of chars

only consider ascii whitespace
---
 src/attr.rs  | 81 ++++++++++++++++++++++++----------------------------
 src/block.rs | 11 +++----
 2 files changed, 43 insertions(+), 49 deletions(-)
diff --git a/src/attr.rs b/src/attr.rs
index 1afa494..ef0e42c 100644
--- a/src/attr.rs
+++ b/src/attr.rs
@@ -8,13 +8,13 @@ pub(crate) fn parse(src: &str) -> Attributes {
     a
 }
 
-pub fn valid<I: Iterator<Item = char>>(chars: I) -> usize {
+pub fn valid(src: &str) -> usize {
     use State::*;
 
     let mut n = 0;
     let mut state = Start;
-    for c in chars {
-        n += c.len_utf8();
+    for c in src.bytes() {
+        n += 1;
         state = state.step(c);
         match state {
             Done | Invalid => break,
@@ -256,11 +256,11 @@ impl Validator {
     /// Returns number of valid bytes parsed (0 means invalid) if finished, otherwise more input is
     /// needed.
     pub fn parse(&mut self, input: &str) -> Option<usize> {
-        let mut chars = input.chars();
-        for c in &mut chars {
+        let mut bytes = input.bytes();
+        for c in &mut bytes {
             self.state = self.state.step(c);
             match self.state {
-                State::Done => return Some(input.len() - chars.as_str().len()),
+                State::Done => return Some(input.len() - bytes.len()),
                 State::Invalid => return Some(0),
                 _ => {}
             }
@@ -297,7 +297,7 @@ impl<'s> Parser<'s> {
         let mut pos = 0;
         let mut pos_prev = 0;
 
-        for c in input.chars() {
+        for c in input.bytes() {
             let state_next = self.state.step(c);
             let st = std::mem::replace(&mut self.state, state_next);
 
@@ -318,7 +318,7 @@ impl<'s> Parser<'s> {
                 }
             };
 
-            pos += c.len_utf8();
+            pos += 1;
 
             debug_assert!(!matches!(self.state, Invalid));
 
@@ -358,40 +358,40 @@ enum State {
 }
 
 impl State {
-    fn step(self, c: char) -> State {
+    fn step(self, c: u8) -> State {
         use State::*;
 
         match self {
-            Start if c == '{' => Whitespace,
+            Start if c == b'{' => Whitespace,
             Start => Invalid,
             Whitespace => match c {
-                '}' => Done,
-                '.' => ClassFirst,
-                '#' => IdentifierFirst,
-                '%' => Comment,
+                b'}' => Done,
+                b'.' => ClassFirst,
+                b'#' => IdentifierFirst,
+                b'%' => Comment,
                 c if is_name(c) => Key,
-                c if c.is_whitespace() => Whitespace,
+                c if c.is_ascii_whitespace() => Whitespace,
                 _ => Invalid,
             },
-            Comment if c == '%' => Whitespace,
+            Comment if c == b'%' => Whitespace,
             Comment => Comment,
             ClassFirst if is_name(c) => Class,
             ClassFirst => Invalid,
             IdentifierFirst if is_name(c) => Identifier,
             IdentifierFirst => Invalid,
             s @ (Class | Identifier | Value) if is_name(c) => s,
-            Class | Identifier | Value if c.is_whitespace() => Whitespace,
-            Class | Identifier | Value if c == '}' => Done,
+            Class | Identifier | Value if c.is_ascii_whitespace() => Whitespace,
+            Class | Identifier | Value if c == b'}' => Done,
             Class | Identifier | Value => Invalid,
             Key if is_name(c) => Key,
-            Key if c == '=' => ValueFirst,
+            Key if c == b'=' => ValueFirst,
             Key => Invalid,
             ValueFirst if is_name(c) => Value,
-            ValueFirst if c == '"' => ValueQuoted,
+            ValueFirst if c == b'"' => ValueQuoted,
             ValueFirst => Invalid,
-            ValueQuoted | ValueNewline | ValueContinued if c == '"' => Whitespace,
-            ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == '\n' => ValueNewline,
-            ValueQuoted if c == '\\' => ValueEscape,
+            ValueQuoted | ValueNewline | ValueContinued if c == b'"' => Whitespace,
+            ValueQuoted | ValueNewline | ValueContinued | ValueEscape if c == b'\n' => ValueNewline,
+            ValueQuoted if c == b'\\' => ValueEscape,
             ValueQuoted | ValueEscape => ValueQuoted,
             ValueNewline | ValueContinued => ValueContinued,
             Invalid | Done => panic!("{:?}", self),
@@ -399,8 +399,8 @@ impl State {
     }
 }
 
-pub fn is_name(c: char) -> bool {
-    c.is_ascii_alphanumeric() || matches!(c, ':' | '_' | '-')
+pub fn is_name(c: u8) -> bool {
+    c.is_ascii_alphanumeric() || matches!(c, b':' | b'_' | b'-')
 }
 
 #[cfg(test)]
@@ -435,11 +435,6 @@ mod test {
         test_attr!("{#a #b}", ("id", "b"));
     }
 
-    #[test]
-    fn unicode_whitespace() {
-        test_attr!("{.a .b}", ("class", "a b"));
-    }
-
     #[test]
     fn value_unquoted() {
         test_attr!(
@@ -517,47 +512,45 @@ mod test {
     #[test]
     fn valid_full() {
         let src = "{.class %comment%}";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
     }
 
     #[test]
     fn valid_unicode() {
         let src = r#"{a="б"}"#;
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
     }
 
     #[test]
     fn valid_empty() {
         let src = "{}";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
     }
 
     #[test]
     fn valid_whitespace() {
         let src = "{ \n }";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
     }
 
     #[test]
     fn valid_comment() {
         let src = "{%comment%}";
-        assert_eq!(super::valid(src.chars()), src.len());
+        assert_eq!(super::valid(src), src.len());
     }
 
     #[test]
     fn valid_trailing() {
-        let src = "{.class}";
-        assert_eq!(
-            super::valid(src.chars().chain("{.ignore}".chars())),
-            src.len(),
-        );
+        let src = "{.class}{.ignore}";
+        let src_valid = "{.class}";
+        assert_eq!(super::valid(src), src_valid.len());
     }
 
     #[test]
     fn valid_invalid() {
-        assert_eq!(super::valid(" {.valid}".chars()), 0);
-        assert_eq!(super::valid("{.class invalid}".chars()), 0);
-        assert_eq!(super::valid("abc".chars()), 0);
-        assert_eq!(super::valid("{.abc.}".chars()), 0);
+        assert_eq!(super::valid(" {.valid}"), 0);
+        assert_eq!(super::valid("{.class invalid}"), 0);
+        assert_eq!(super::valid("abc"), 0);
+        assert_eq!(super::valid("{.abc.}"), 0);
     }
 }
diff --git a/src/block.rs b/src/block.rs
index 077a6c9..56e5f5f 100644
--- a/src/block.rs
+++ b/src/block.rs
@@ -834,8 +834,9 @@ impl<'s> IdentifiedBlock<'s> {
                     None
                 }
             }
-            '{' => (attr::valid(line.chars()) == lt)
-                .then(|| (Kind::Atom(Attributes), Span::by_len(indent, l))),
+            '{' => {
+                (attr::valid(line) == lt).then(|| (Kind::Atom(Attributes), Span::by_len(indent, l)))
+            }
             '|' => {
                 if lt >= 2 && line_t.ends_with('|') && !line_t.ends_with("\\|") {
                     Some((Kind::Table { caption: false }, Span::empty_at(indent)))
@@ -902,10 +903,10 @@ impl<'s> IdentifiedBlock<'s> {
                 let spec =
                     &line_t[fence_length..].trim_start_matches(|c: char| c.is_ascii_whitespace());
                 let valid_spec = if f == ':' {
-                    spec.chars().all(attr::is_name)
+                    spec.bytes().all(attr::is_name)
                 } else {
-                    !spec.chars().any(|c| c.is_ascii_whitespace())
-                        && !spec.chars().any(|c| c == '`')
+                    !spec.bytes().any(|c| c.is_ascii_whitespace())
+                        && !spec.bytes().any(|c| c == b'`')
                 };
                 (valid_spec && fence_length >= 3).then(|| {
                     (