inline: use bytes instead of chars

This commit is contained in:
Noah Hellman 2023-05-13 20:49:40 +02:00
parent eaa21fd393
commit 3908823d18
4 changed files with 60 additions and 51 deletions

View file

@ -601,7 +601,7 @@ impl<'s> TreeParser<'s> {
let row_event_enter = let row_event_enter =
self.enter(Node::Container(TableRow { head: false }), row.with_len(1)); self.enter(Node::Container(TableRow { head: false }), row.with_len(1));
let rem = row.skip(1); // | let rem = row.skip(1); // |
let lex = lex::Lexer::new(rem.of(self.src)); let lex = lex::Lexer::new(rem.of(self.src).as_bytes());
let mut pos = rem.start(); let mut pos = rem.start();
let mut cell_start = pos; let mut cell_start = pos;
let mut separator_row = true; let mut separator_row = true;

View file

@ -94,7 +94,7 @@ impl<'s> Input<'s> {
fn new(src: &'s str) -> Self { fn new(src: &'s str) -> Self {
Self { Self {
src, src,
lexer: lex::Lexer::new(""), lexer: lex::Lexer::new(b""),
complete: false, complete: false,
span_line: Span::new(0, 0), span_line: Span::new(0, 0),
ahead: std::collections::VecDeque::new(), ahead: std::collections::VecDeque::new(),
@ -118,13 +118,13 @@ impl<'s> Input<'s> {
} }
fn set_current_line(&mut self, line: Span) { fn set_current_line(&mut self, line: Span) {
self.lexer = lex::Lexer::new(line.of(self.src)); self.lexer = lex::Lexer::new(line.of(self.src).as_bytes());
self.span_line = line; self.span_line = line;
self.span = line.empty_before(); self.span = line.empty_before();
} }
fn reset(&mut self) { fn reset(&mut self) {
self.lexer = lex::Lexer::new(""); self.lexer = lex::Lexer::new(b"");
self.complete = false; self.complete = false;
self.ahead.clear(); self.ahead.clear();
} }
@ -154,21 +154,22 @@ impl<'s> Input<'s> {
self.lexer.peek().map(|t| &t.kind), self.lexer.peek().map(|t| &t.kind),
Some(lex::Kind::Open(Delimiter::BraceEqual)) Some(lex::Kind::Open(Delimiter::BraceEqual))
) { ) {
let mut ahead = self.lexer.ahead().chars();
let mut end = false; let mut end = false;
let len = (&mut ahead) let len = self
.lexer
.ahead()
.iter()
.skip(2) // {= .skip(2) // {=
.take_while(|c| { .take_while(|c| {
if *c == '{' { if **c == b'{' {
return false; return false;
} }
if *c == '}' { if **c == b'}' {
end = true; end = true;
}; };
!end && !c.is_whitespace() !end && !c.is_ascii_whitespace()
}) })
.map(char::len_utf8) .count();
.sum();
(len > 0 && end).then(|| { (len > 0 && end).then(|| {
let tok = self.eat(); let tok = self.eat();
debug_assert_eq!( debug_assert_eq!(
@ -178,7 +179,7 @@ impl<'s> Input<'s> {
len: 2, len: 2,
}) })
); );
self.lexer = lex::Lexer::new(ahead.as_str()); self.lexer.skip_ahead(len + 1);
self.span.after(len) self.span.after(len)
}) })
} else { } else {
@ -493,7 +494,7 @@ impl<'s> Parser<'s> {
if opener_eaten { if opener_eaten {
self.input.span = Span::empty_at(start_attr); self.input.span = Span::empty_at(start_attr);
self.input.lexer = lex::Lexer::new( self.input.lexer = lex::Lexer::new(
&self.input.src[start_attr..self.input.span_line.end()], &self.input.src.as_bytes()[start_attr..self.input.span_line.end()],
); );
} }
return Some(More); return Some(More);
@ -523,7 +524,7 @@ impl<'s> Parser<'s> {
self.input.set_current_line(l); self.input.set_current_line(l);
} }
self.input.span = Span::new(start_attr, state.end_attr); self.input.span = Span::new(start_attr, state.end_attr);
self.input.lexer = lex::Lexer::new(&self.input.src[state.end_attr..line_end]); self.input.lexer = lex::Lexer::new(&self.input.src.as_bytes()[state.end_attr..line_end]);
if attrs.is_empty() { if attrs.is_empty() {
if matches!(state.elem_ty, AttributesElementType::Container { .. }) { if matches!(state.elem_ty, AttributesElementType::Container { .. }) {
@ -563,26 +564,28 @@ impl<'s> Parser<'s> {
fn parse_autolink(&mut self, first: &lex::Token) -> Option<ControlFlow> { fn parse_autolink(&mut self, first: &lex::Token) -> Option<ControlFlow> {
if first.kind == lex::Kind::Sym(Symbol::Lt) { if first.kind == lex::Kind::Sym(Symbol::Lt) {
let mut ahead = self.input.lexer.ahead().chars();
let mut end = false; let mut end = false;
let mut is_url = false; let mut is_url = false;
let len = (&mut ahead) let len = self
.input
.lexer
.ahead()
.iter()
.take_while(|c| { .take_while(|c| {
if *c == '<' { if **c == b'<' {
return false; return false;
} }
if *c == '>' { if **c == b'>' {
end = true; end = true;
}; };
if matches!(*c, ':' | '@') { if matches!(*c, b':' | b'@') {
is_url = true; is_url = true;
} }
!end && !c.is_whitespace() !end && !c.is_ascii_whitespace()
}) })
.map(char::len_utf8) .count();
.sum();
if end && is_url { if end && is_url {
self.input.lexer = lex::Lexer::new(ahead.as_str()); self.input.lexer.skip_ahead(len + 1);
let span_url = self.input.span.after(len); let span_url = self.input.span.after(len);
let url = span_url.of(self.input.src); let url = span_url.of(self.input.src);
self.push(EventKind::Enter(Autolink(url))); self.push(EventKind::Enter(Autolink(url)));
@ -597,22 +600,24 @@ impl<'s> Parser<'s> {
fn parse_symbol(&mut self, first: &lex::Token) -> Option<ControlFlow> { fn parse_symbol(&mut self, first: &lex::Token) -> Option<ControlFlow> {
if first.kind == lex::Kind::Sym(Symbol::Colon) { if first.kind == lex::Kind::Sym(Symbol::Colon) {
let mut ahead = self.input.lexer.ahead().chars();
let mut end = false; let mut end = false;
let mut valid = true; let mut valid = true;
let len = (&mut ahead) let len = self
.input
.lexer
.ahead()
.iter()
.take_while(|c| { .take_while(|c| {
if *c == ':' { if **c == b':' {
end = true; end = true;
} else if !c.is_ascii_alphanumeric() && !matches!(c, '-' | '+' | '_') { } else if !c.is_ascii_alphanumeric() && !matches!(c, b'-' | b'+' | b'_') {
valid = false; valid = false;
} }
!end && !c.is_whitespace() !end && !c.is_ascii_whitespace()
}) })
.map(char::len_utf8) .count();
.sum();
if end && valid { if end && valid {
self.input.lexer = lex::Lexer::new(ahead.as_str()); self.input.lexer.skip_ahead(len + 1);
let span_symbol = self.input.span.after(len); let span_symbol = self.input.span.after(len);
self.input.span = Span::new(self.input.span.start(), span_symbol.end() + 1); self.input.span = Span::new(self.input.span.start(), span_symbol.end() + 1);
return self.push(EventKind::Atom(Atom::Symbol( return self.push(EventKind::Atom(Atom::Symbol(
@ -641,22 +646,24 @@ impl<'s> Parser<'s> {
len: 1, len: 1,
}) })
); );
let mut ahead = self.input.lexer.ahead().chars();
let mut end = false; let mut end = false;
let len = (&mut ahead) let len = self
.input
.lexer
.ahead()
.iter()
.take_while(|c| { .take_while(|c| {
if *c == '[' { if **c == b'[' {
return false; return false;
} }
if *c == ']' { if **c == b']' {
end = true; end = true;
}; };
!end && *c != '\n' !end && **c != b'\n'
}) })
.map(char::len_utf8) .count();
.sum();
if end { if end {
self.input.lexer = lex::Lexer::new(ahead.as_str()); self.input.lexer.skip_ahead(len + 1);
let span_label = self.input.span.after(len); let span_label = self.input.span.after(len);
let label = span_label.of(self.input.src); let label = span_label.of(self.input.src);
self.input.span = Span::new(self.input.span.start(), span_label.end() + 1); self.input.span = Span::new(self.input.span.start(), span_label.end() + 1);
@ -837,9 +844,9 @@ impl<'s> Parser<'s> {
.input .input
.lexer .lexer
.ahead() .ahead()
.chars() .iter()
.next() .next()
.map_or(true, char::is_whitespace); .map_or(true, |c| c.is_ascii_whitespace());
if opener.bidirectional() && whitespace_after { if opener.bidirectional() && whitespace_after {
return None; return None;
} }

View file

@ -71,7 +71,7 @@ impl Sequence {
#[derive(Clone)] #[derive(Clone)]
pub(crate) struct Lexer<'s> { pub(crate) struct Lexer<'s> {
src: &'s str, src: &'s [u8],
/// Current position within `src`. /// Current position within `src`.
pos: usize, pos: usize,
/// Next character should be escaped. /// Next character should be escaped.
@ -81,7 +81,7 @@ pub(crate) struct Lexer<'s> {
} }
impl<'s> Lexer<'s> { impl<'s> Lexer<'s> {
pub fn new(src: &'s str) -> Self { pub fn new(src: &'s [u8]) -> Self {
Lexer { Lexer {
src, src,
pos: 0, pos: 0,
@ -99,10 +99,14 @@ impl<'s> Lexer<'s> {
self.next.as_ref() self.next.as_ref()
} }
pub fn ahead(&self) -> &'s str { pub fn ahead(&self) -> &'s [u8] {
&self.src[self.pos - self.next.as_ref().map_or(0, |t| t.len)..] &self.src[self.pos - self.next.as_ref().map_or(0, |t| t.len)..]
} }
pub fn skip_ahead(&mut self, n: usize) {
*self = Self::new(&self.src[self.pos + n..]);
}
fn next_token(&mut self) -> Option<Token> { fn next_token(&mut self) -> Option<Token> {
let mut current = self.token(); let mut current = self.token();
@ -119,7 +123,7 @@ impl<'s> Lexer<'s> {
} }
fn peek_byte_n(&mut self, n: usize) -> Option<u8> { fn peek_byte_n(&mut self, n: usize) -> Option<u8> {
self.src.as_bytes().get(self.pos + n).copied() self.src.get(self.pos + n).copied()
} }
fn peek_byte(&mut self) -> Option<u8> { fn peek_byte(&mut self) -> Option<u8> {
@ -128,7 +132,7 @@ impl<'s> Lexer<'s> {
fn eat_byte(&mut self) -> Option<u8> { fn eat_byte(&mut self) -> Option<u8> {
if self.pos < self.src.len() { if self.pos < self.src.len() {
let c = self.src.as_bytes()[self.pos]; let c = self.src[self.pos];
self.pos += 1; self.pos += 1;
Some(c) Some(c)
} else { } else {
@ -155,9 +159,9 @@ impl<'s> Lexer<'s> {
b'\n' => Hardbreak, b'\n' => Hardbreak,
b'\t' | b' ' b'\t' | b' '
if self.src[self.pos..] if self.src[self.pos..]
.bytes() .iter()
.find(|c| !matches!(c, b' ' | b'\t')) .find(|c| !matches!(c, b' ' | b'\t'))
== Some(b'\n') => == Some(&b'\n') =>
{ {
while self.eat_byte() != Some(b'\n') {} while self.eat_byte() != Some(b'\n') {}
Hardbreak Hardbreak
@ -319,7 +323,7 @@ mod test {
macro_rules! test_lex { macro_rules! test_lex {
($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
#[allow(unused)] #[allow(unused)]
let actual = super::Lexer::new($src).collect::<Vec<_>>(); let actual = super::Lexer::new($src.as_bytes()).collect::<Vec<_>>();
let expected = vec![$($($token),*,)?]; let expected = vec![$($($token),*,)?];
assert_eq!(actual, expected, "{}", $src); assert_eq!(actual, expected, "{}", $src);
}; };

View file

@ -3,8 +3,6 @@
f4f22fc:attribute key class order f4f22fc:attribute key class order
ae6fc15:bugged left/right quote ae6fc15:bugged left/right quote
168469a:bugged left/right quote 168469a:bugged left/right quote
2056174:unicode whitespace emph
2e8fffa:unicode whitespace strong
e1f5b5e:untrimmed whitespace before linebreak e1f5b5e:untrimmed whitespace before linebreak
07888f3:div close within raw block 07888f3:div close within raw block
8423412:heading id conflict with existing id 8423412:heading id conflict with existing id