diff --git a/src/block.rs b/src/block.rs
index 56e5f5f..f4b5ecb 100644
--- a/src/block.rs
+++ b/src/block.rs
@@ -601,7 +601,7 @@ impl<'s> TreeParser<'s> {
             let row_event_enter =
                 self.enter(Node::Container(TableRow { head: false }), row.with_len(1));
             let rem = row.skip(1); // |
-            let lex = lex::Lexer::new(rem.of(self.src));
+            let lex = lex::Lexer::new(rem.of(self.src).as_bytes());
             let mut pos = rem.start();
             let mut cell_start = pos;
             let mut separator_row = true;
diff --git a/src/inline.rs b/src/inline.rs
index 6604293..5fa5656 100644
--- a/src/inline.rs
+++ b/src/inline.rs
@@ -94,7 +94,7 @@ impl<'s> Input<'s> {
     fn new(src: &'s str) -> Self {
         Self {
             src,
-            lexer: lex::Lexer::new(""),
+            lexer: lex::Lexer::new(b""),
             complete: false,
             span_line: Span::new(0, 0),
             ahead: std::collections::VecDeque::new(),
@@ -118,13 +118,13 @@ impl<'s> Input<'s> {
     }
 
     fn set_current_line(&mut self, line: Span) {
-        self.lexer = lex::Lexer::new(line.of(self.src));
+        self.lexer = lex::Lexer::new(line.of(self.src).as_bytes());
         self.span_line = line;
         self.span = line.empty_before();
     }
 
     fn reset(&mut self) {
-        self.lexer = lex::Lexer::new("");
+        self.lexer = lex::Lexer::new(b"");
         self.complete = false;
         self.ahead.clear();
     }
@@ -154,21 +154,22 @@ impl<'s> Input<'s> {
             self.lexer.peek().map(|t| &t.kind),
             Some(lex::Kind::Open(Delimiter::BraceEqual))
         ) {
-            let mut ahead = self.lexer.ahead().chars();
             let mut end = false;
-            let len = (&mut ahead)
+            let len = self
+                .lexer
+                .ahead()
+                .iter()
                 .skip(2) // {=
                 .take_while(|c| {
-                    if *c == '{' {
+                    if **c == b'{' {
                         return false;
                     }
-                    if *c == '}' {
+                    if **c == b'}' {
                         end = true;
                     };
-                    !end && !c.is_whitespace()
+                    !end && !c.is_ascii_whitespace()
                 })
-                .map(char::len_utf8)
-                .sum();
+                .count();
             (len > 0 && end).then(|| {
                 let tok = self.eat();
                 debug_assert_eq!(
@@ -178,7 +179,7 @@ impl<'s> Input<'s> {
                         len: 2,
                     })
                 );
-                self.lexer = lex::Lexer::new(ahead.as_str());
+                self.lexer.skip_ahead(len + 1);
                 self.span.after(len)
             })
         } else {
@@ -493,7 +494,7 @@ impl<'s> Parser<'s> {
                 if opener_eaten {
                     self.input.span = Span::empty_at(start_attr);
                     self.input.lexer = lex::Lexer::new(
-                        &self.input.src[start_attr..self.input.span_line.end()],
+                        &self.input.src.as_bytes()[start_attr..self.input.span_line.end()],
                     );
                 }
                 return Some(More);
@@ -523,7 +524,7 @@ impl<'s> Parser<'s> {
                 self.input.set_current_line(l);
             }
             self.input.span = Span::new(start_attr, state.end_attr);
-            self.input.lexer = lex::Lexer::new(&self.input.src[state.end_attr..line_end]);
+            self.input.lexer = lex::Lexer::new(&self.input.src.as_bytes()[state.end_attr..line_end]);
 
             if attrs.is_empty() {
                 if matches!(state.elem_ty, AttributesElementType::Container { .. }) {
@@ -563,26 +564,28 @@ impl<'s> Parser<'s> {
 
     fn parse_autolink(&mut self, first: &lex::Token) -> Option {
         if first.kind == lex::Kind::Sym(Symbol::Lt) {
-            let mut ahead = self.input.lexer.ahead().chars();
             let mut end = false;
             let mut is_url = false;
-            let len = (&mut ahead)
+            let len = self
+                .input
+                .lexer
+                .ahead()
+                .iter()
                 .take_while(|c| {
-                    if *c == '<' {
+                    if **c == b'<' {
                         return false;
                     }
-                    if *c == '>' {
+                    if **c == b'>' {
                         end = true;
                     };
-                    if matches!(*c, ':' | '@') {
+                    if matches!(*c, b':' | b'@') {
                         is_url = true;
                     }
-                    !end && !c.is_whitespace()
+                    !end && !c.is_ascii_whitespace()
                 })
-                .map(char::len_utf8)
-                .sum();
+                .count();
             if end && is_url {
-                self.input.lexer = lex::Lexer::new(ahead.as_str());
+                self.input.lexer.skip_ahead(len + 1);
                 let span_url = self.input.span.after(len);
                 let url = span_url.of(self.input.src);
                 self.push(EventKind::Enter(Autolink(url)));
@@ -597,22 +600,24 @@ impl<'s> Parser<'s> {
 
     fn parse_symbol(&mut self, first: &lex::Token) -> Option {
         if first.kind == lex::Kind::Sym(Symbol::Colon) {
-            let mut ahead = self.input.lexer.ahead().chars();
             let mut end = false;
             let mut valid = true;
-            let len = (&mut ahead)
+            let len = self
+                .input
+                .lexer
+                .ahead()
+                .iter()
                 .take_while(|c| {
-                    if *c == ':' {
+                    if **c == b':' {
                         end = true;
-                    } else if !c.is_ascii_alphanumeric() && !matches!(c, '-' | '+' | '_') {
+                    } else if !c.is_ascii_alphanumeric() && !matches!(c, b'-' | b'+' | b'_') {
                         valid = false;
                     }
-                    !end && !c.is_whitespace()
+                    !end && !c.is_ascii_whitespace()
                 })
-                .map(char::len_utf8)
-                .sum();
+                .count();
             if end && valid {
-                self.input.lexer = lex::Lexer::new(ahead.as_str());
+                self.input.lexer.skip_ahead(len + 1);
                 let span_symbol = self.input.span.after(len);
                 self.input.span = Span::new(self.input.span.start(), span_symbol.end() + 1);
                 return self.push(EventKind::Atom(Atom::Symbol(
@@ -641,22 +646,24 @@ impl<'s> Parser<'s> {
                 len: 1,
             })
         );
-        let mut ahead = self.input.lexer.ahead().chars();
         let mut end = false;
-        let len = (&mut ahead)
+        let len = self
+            .input
+            .lexer
+            .ahead()
+            .iter()
            .take_while(|c| {
-                if *c == '[' {
+                if **c == b'[' {
                     return false;
                 }
-                if *c == ']' {
+                if **c == b']' {
                     end = true;
                 };
-                !end && *c != '\n'
+                !end && **c != b'\n'
             })
-            .map(char::len_utf8)
-            .sum();
+            .count();
         if end {
-            self.input.lexer = lex::Lexer::new(ahead.as_str());
+            self.input.lexer.skip_ahead(len + 1);
             let span_label = self.input.span.after(len);
             let label = span_label.of(self.input.src);
             self.input.span = Span::new(self.input.span.start(), span_label.end() + 1);
@@ -837,9 +844,9 @@ impl<'s> Parser<'s> {
             .input
             .lexer
             .ahead()
-            .chars()
+            .iter()
             .next()
-            .map_or(true, char::is_whitespace);
+            .map_or(true, |c| c.is_ascii_whitespace());
         if opener.bidirectional() && whitespace_after {
             return None;
         }
diff --git a/src/lex.rs b/src/lex.rs
index 3d3aeca..b418c51 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -71,7 +71,7 @@ impl Sequence {
 
 #[derive(Clone)]
 pub(crate) struct Lexer<'s> {
-    src: &'s str,
+    src: &'s [u8],
     /// Current position within `src`.
     pos: usize,
     /// Next character should be escaped.
@@ -81,7 +81,7 @@ pub(crate) struct Lexer<'s> {
 }
 
 impl<'s> Lexer<'s> {
-    pub fn new(src: &'s str) -> Self {
+    pub fn new(src: &'s [u8]) -> Self {
         Lexer {
             src,
             pos: 0,
@@ -99,10 +99,14 @@ impl<'s> Lexer<'s> {
         self.next.as_ref()
     }
 
-    pub fn ahead(&self) -> &'s str {
+    pub fn ahead(&self) -> &'s [u8] {
         &self.src[self.pos - self.next.as_ref().map_or(0, |t| t.len)..]
     }
 
+    pub fn skip_ahead(&mut self, n: usize) {
+        *self = Self::new(&self.src[self.pos + n..]);
+    }
+
     fn next_token(&mut self) -> Option<Token> {
         let mut current = self.token();
 
@@ -119,7 +123,7 @@ impl<'s> Lexer<'s> {
     }
 
     fn peek_byte_n(&mut self, n: usize) -> Option<u8> {
-        self.src.as_bytes().get(self.pos + n).copied()
+        self.src.get(self.pos + n).copied()
     }
 
     fn peek_byte(&mut self) -> Option<u8> {
@@ -128,7 +132,7 @@ impl<'s> Lexer<'s> {
 
     fn eat_byte(&mut self) -> Option<u8> {
         if self.pos < self.src.len() {
-            let c = self.src.as_bytes()[self.pos];
+            let c = self.src[self.pos];
             self.pos += 1;
             Some(c)
         } else {
@@ -155,9 +159,9 @@ impl<'s> Lexer<'s> {
             b'\n' => Hardbreak,
             b'\t' | b' '
                 if self.src[self.pos..]
-                    .bytes()
+                    .iter()
                     .find(|c| !matches!(c, b' ' | b'\t'))
-                    == Some(b'\n') =>
+                    == Some(&b'\n') =>
             {
                 while self.eat_byte() != Some(b'\n') {}
                 Hardbreak
@@ -319,7 +323,7 @@ mod test {
     macro_rules! test_lex {
         ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
             #[allow(unused)]
-            let actual = super::Lexer::new($src).collect::<Vec<_>>();
+            let actual = super::Lexer::new($src.as_bytes()).collect::<Vec<_>>();
             let expected = vec![$($($token),*,)?];
             assert_eq!(actual, expected, "{}", $src);
         };
diff --git a/tests/html-ut/skip b/tests/html-ut/skip
index 75b4c75..bb4d9bb 100644
--- a/tests/html-ut/skip
+++ b/tests/html-ut/skip
@@ -3,8 +3,6 @@
 f4f22fc:attribute key class order
 ae6fc15:bugged left/right quote
 168469a:bugged left/right quote
-2056174:unicode whitespace emph
-2e8fffa:unicode whitespace strong
 e1f5b5e:untrimmed whitespace before linebreak
 07888f3:div close within raw block
 8423412:heading id conflict with existing id