lex: use bytes instead of chars

All special characters are ASCII (within 0x00-0x7f), so chars aren't
required. Using bytes will make it easier to use SIMD instructions.
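
As a rough sketch (not part of this commit, helper name hypothetical):
every delimiter is a single ASCII byte, so the scan can compare raw u8
values, and a plain byte loop like the one below is far easier for the
compiler to unroll or auto-vectorize than a char-by-char loop.

    // Hypothetical helper: find the next special byte in `src`. Because the
    // delimiters are ASCII (0x00-0x7f), comparing raw bytes stays correct even
    // inside multi-byte UTF-8 sequences. Only a subset of delimiters is shown.
    fn find_special(src: &str) -> Option<usize> {
        src.as_bytes()
            .iter()
            .position(|&b| matches!(b, b'\\' | b'[' | b']' | b'{' | b'}' | b'`' | b'\n'))
    }

    fn main() {
        // "α" is two bytes in UTF-8, yet the byte scan still lands on '[' at index 2.
        assert_eq!(find_special("α[x]"), Some(2));
    }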

Some places in inline.rs required changes because there is no longer
any guarantee that token/event spans are aligned to UTF-8 character
boundaries. Calling event.span.of(src) may now cause a panic.
Noah Hellman 2023-03-01 22:16:09 +01:00
parent 55234bf193
commit 798f8941d8
2 changed files with 137 additions and 125 deletions

inline.rs

@@ -349,8 +349,9 @@ impl<'s> Parser<'s> {
                 .input
                 .span
                 .of(self.input.src)
-                .chars()
-                .all(char::is_whitespace);
+                .as_bytes()
+                .iter()
+                .all(|b| b.is_ascii_whitespace());
             if is_whitespace {
                 if !*non_whitespace_encountered
                     && self.input.peek().map_or(false, |t| {
@@ -683,13 +684,11 @@ impl<'s> Parser<'s> {
             // empty container
             return None;
         }
-        let whitespace_before = self.events.back().map_or(false, |ev| {
-            ev.span
-                .of(self.input.src)
-                .chars()
-                .last()
-                .map_or(false, char::is_whitespace)
-        });
+        let whitespace_before = if 0 < self.input.span.start() {
+            self.input.src.as_bytes()[self.input.span.start() - 1].is_ascii_whitespace()
+        } else {
+            false
+        };
         if opener.bidirectional() && whitespace_before {
             return None;
         }
@@ -748,13 +747,20 @@ impl<'s> Parser<'s> {
                 !matches!(ev.kind, EventKind::Str | EventKind::Atom(..))
             })
         {
-            events_text
-                .filter(|ev| {
-                    matches!(ev.kind, EventKind::Str | EventKind::Atom(..))
-                })
-                .map(|ev| ev.span.of(self.input.src))
-                .collect::<String>()
-                .into()
+            let mut spec = String::new();
+            let mut span = Span::new(0, 0);
+            for ev in events_text.filter(|ev| {
+                matches!(ev.kind, EventKind::Str | EventKind::Atom(..))
+            }) {
+                if span.end() == ev.span.start() {
+                    span = Span::new(span.start(), ev.span.end());
+                } else {
+                    spec.push_str(span.of(self.input.src));
+                    span = ev.span;
+                }
+            }
+            spec.push_str(span.of(self.input.src));
+            spec.into()
         } else {
             span_spec.of(self.input.src).into()
         }
@@ -837,13 +843,11 @@ impl<'s> Parser<'s> {
         if opener.bidirectional() && whitespace_after {
             return None;
         }
-        let whitespace_before = self.events.back().map_or(false, |ev| {
-            ev.span
-                .of(self.input.src)
-                .chars()
-                .last()
-                .map_or(false, char::is_whitespace)
-        });
+        let whitespace_before = if 0 < self.input.span.start() {
+            self.input.src.as_bytes()[self.input.span.start() - 1].is_ascii_whitespace()
+        } else {
+            false
+        };
         if matches!(opener, Opener::SingleQuoted | Opener::DoubleQuoted)
             && self
                 .events

lex.rs

@@ -60,11 +60,11 @@ pub enum Sequence {
 }
 
 impl Sequence {
-    fn ch(self) -> char {
+    fn ch(self) -> u8 {
         match self {
-            Self::Backtick => '`',
-            Self::Period => '.',
-            Self::Hyphen => '-',
+            Self::Backtick => b'`',
+            Self::Period => b'.',
+            Self::Hyphen => b'-',
         }
     }
 }
@@ -72,23 +72,21 @@ impl Sequence {
 #[derive(Clone)]
 pub(crate) struct Lexer<'s> {
     src: &'s str,
-    chars: std::str::Chars<'s>,
+    /// Current position within `src`.
+    pos: usize,
     /// Next character should be escaped.
     escape: bool,
     /// Token to be peeked or next'ed.
     next: Option<Token>,
-    /// Length of current token.
-    len: usize,
 }
 
 impl<'s> Lexer<'s> {
     pub fn new(src: &'s str) -> Self {
         Lexer {
             src,
-            chars: src.chars(),
+            pos: 0,
             escape: false,
             next: None,
-            len: 0,
         }
     }
@@ -102,9 +100,7 @@ impl<'s> Lexer<'s> {
     }
 
     pub fn ahead(&self) -> &'s str {
-        let pos =
-            self.src.len() - self.chars.as_str().len() - self.next.as_ref().map_or(0, |t| t.len);
-        &self.src[pos..]
+        &self.src[self.pos - self.next.as_ref().map_or(0, |t| t.len)..]
     }
 
     fn next_token(&mut self) -> Option<Token> {
@@ -122,24 +118,28 @@ impl<'s> Lexer<'s> {
         current
     }
 
-    fn peek_char_n(&mut self, n: usize) -> Option<char> {
-        self.chars.clone().nth(n)
+    fn peek_byte_n(&mut self, n: usize) -> Option<u8> {
+        self.src.as_bytes().get(self.pos + n).copied()
     }
 
-    fn peek_char(&mut self) -> Option<char> {
-        self.peek_char_n(0)
+    fn peek_byte(&mut self) -> Option<u8> {
+        self.peek_byte_n(0)
     }
 
-    fn eat_char(&mut self) -> Option<char> {
-        let c = self.chars.next();
-        self.len += c.map_or(0, char::len_utf8);
-        c
+    fn eat_byte(&mut self) -> Option<u8> {
+        if self.pos < self.src.len() {
+            let c = self.src.as_bytes()[self.pos];
+            self.pos += 1;
+            Some(c)
+        } else {
+            None
+        }
     }
 
-    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
-        while let Some(c) = self.peek_char() {
+    fn eat_while(&mut self, mut predicate: impl FnMut(u8) -> bool) {
+        while let Some(c) = self.peek_byte() {
             if predicate(c) {
-                self.eat_char();
+                self.eat_byte();
             } else {
                 break;
             }
@@ -147,34 +147,36 @@ impl<'s> Lexer<'s> {
     }
 
     fn token(&mut self) -> Option<Token> {
-        self.len = 0;
+        let start = self.pos;
 
         let kind = if self.escape {
             self.escape = false;
-            match self.eat_char()? {
-                '\n' => Hardbreak,
-                '\t' | ' '
-                    if self.chars.clone().find(|c| !matches!(c, ' ' | '\t')) == Some('\n') =>
+            match self.eat_byte()? {
+                b'\n' => Hardbreak,
+                b'\t' | b' '
+                    if self.src[self.pos..]
+                        .bytes()
+                        .find(|c| !matches!(c, b' ' | b'\t'))
+                        == Some(b'\n') =>
                 {
-                    while self.eat_char() != Some('\n') {}
+                    while self.eat_byte() != Some(b'\n') {}
                     Hardbreak
                 }
-                ' ' => Nbsp,
+                b' ' => Nbsp,
                 _ => Text,
             }
         } else {
             self.eat_while(|c| !is_special(c));
-            if self.len > 0 {
+            if start < self.pos {
                 Text
             } else {
-                match self.eat_char()? {
-                    '\n' => Newline,
-                    '\\' => {
-                        if self
-                            .peek_char()
-                            .map_or(false, |c| c.is_whitespace() || c.is_ascii_punctuation())
-                        {
+                match self.eat_byte()? {
+                    b'\n' => Newline,
+                    b'\\' => {
+                        if self.peek_byte().map_or(false, |c| {
+                            c.is_ascii_whitespace() || c.is_ascii_punctuation()
+                        }) {
                             self.escape = true;
                             Escape
                         } else {
@@ -182,62 +184,67 @@ impl<'s> Lexer<'s> {
                         }
                     }
-                    '[' => Open(Bracket),
-                    ']' => Close(Bracket),
-                    '(' => Open(Paren),
-                    ')' => Close(Paren),
-                    '{' => {
-                        let explicit = match self.peek_char() {
-                            Some('*') => Some(Open(BraceAsterisk)),
-                            Some('^') => Some(Open(BraceCaret)),
-                            Some('=') => Some(Open(BraceEqual)),
-                            Some('-') => Some(Open(BraceHyphen)),
-                            Some('+') => Some(Open(BracePlus)),
-                            Some('~') => Some(Open(BraceTilde)),
-                            Some('_') => Some(Open(BraceUnderscore)),
-                            Some('\'') => Some(Open(BraceQuote1)),
-                            Some('"') => Some(Open(BraceQuote2)),
+                    b'[' => Open(Bracket),
+                    b']' => Close(Bracket),
+                    b'(' => Open(Paren),
+                    b')' => Close(Paren),
+                    b'{' => {
+                        let explicit = match self.peek_byte() {
+                            Some(b'*') => Some(Open(BraceAsterisk)),
+                            Some(b'^') => Some(Open(BraceCaret)),
+                            Some(b'=') => Some(Open(BraceEqual)),
+                            Some(b'-') => Some(Open(BraceHyphen)),
+                            Some(b'+') => Some(Open(BracePlus)),
+                            Some(b'~') => Some(Open(BraceTilde)),
+                            Some(b'_') => Some(Open(BraceUnderscore)),
+                            Some(b'\'') => Some(Open(BraceQuote1)),
+                            Some(b'"') => Some(Open(BraceQuote2)),
                             _ => None,
                         };
                         if let Some(exp) = explicit {
-                            self.eat_char();
+                            self.eat_byte();
                             exp
                         } else {
                             Open(Brace)
                         }
                     }
-                    '}' => Close(Brace),
-                    '*' => self.maybe_eat_close_brace(Sym(Asterisk), BraceAsterisk),
-                    '^' => self.maybe_eat_close_brace(Sym(Caret), BraceCaret),
-                    '=' => self.maybe_eat_close_brace(Text, BraceEqual),
-                    '+' => self.maybe_eat_close_brace(Text, BracePlus),
-                    '~' => self.maybe_eat_close_brace(Sym(Tilde), BraceTilde),
-                    '_' => self.maybe_eat_close_brace(Sym(Underscore), BraceUnderscore),
-                    '\'' => self.maybe_eat_close_brace(Sym(Quote1), BraceQuote1),
-                    '"' => self.maybe_eat_close_brace(Sym(Quote2), BraceQuote2),
-                    '-' => {
-                        if self.peek_char() == Some('}') {
-                            self.eat_char();
+                    b'}' => Close(Brace),
+                    b'*' => self.maybe_eat_close_brace(Sym(Asterisk), BraceAsterisk),
+                    b'^' => self.maybe_eat_close_brace(Sym(Caret), BraceCaret),
+                    b'=' => self.maybe_eat_close_brace(Text, BraceEqual),
+                    b'+' => self.maybe_eat_close_brace(Text, BracePlus),
+                    b'~' => self.maybe_eat_close_brace(Sym(Tilde), BraceTilde),
+                    b'_' => self.maybe_eat_close_brace(Sym(Underscore), BraceUnderscore),
+                    b'\'' => self.maybe_eat_close_brace(Sym(Quote1), BraceQuote1),
+                    b'"' => self.maybe_eat_close_brace(Sym(Quote2), BraceQuote2),
+                    b'-' => {
+                        if self.peek_byte() == Some(b'}') {
+                            self.eat_byte();
                             Close(BraceHyphen)
                         } else {
-                            while self.peek_char() == Some('-') && self.peek_char_n(1) != Some('}')
+                            while self.peek_byte() == Some(b'-')
+                                && self.peek_byte_n(1) != Some(b'}')
                             {
-                                self.eat_char();
+                                self.eat_byte();
                             }
                             Seq(Hyphen)
                         }
                     }
-                    '!' if self.peek_char() == Some('[') => {
-                        self.eat_char();
-                        Sym(ExclaimBracket)
+                    b'!' => {
+                        if self.peek_byte() == Some(b'[') {
+                            self.eat_byte();
+                            Sym(ExclaimBracket)
+                        } else {
+                            Text
+                        }
                     }
-                    '<' => Sym(Lt),
-                    '|' => Sym(Pipe),
-                    ':' => Sym(Colon),
-                    '`' => self.eat_seq(Backtick),
-                    '.' => self.eat_seq(Period),
+                    b'<' => Sym(Lt),
+                    b'|' => Sym(Pipe),
+                    b':' => Sym(Colon),
+                    b'`' => self.eat_seq(Backtick),
+                    b'.' => self.eat_seq(Period),
                     _ => Text,
                 }
@@ -246,7 +253,7 @@ impl<'s> Lexer<'s> {
 
         Some(Token {
             kind,
-            len: self.len,
+            len: self.pos - start,
         })
     }
@@ -256,8 +263,8 @@ impl<'s> Lexer<'s> {
     }
 
     fn maybe_eat_close_brace(&mut self, kind: Kind, d: Delimiter) -> Kind {
-        if self.peek_char() == Some('}') {
-            self.eat_char();
+        if self.peek_byte() == Some(b'}') {
+            self.eat_byte();
             Close(d)
         } else {
             kind
@@ -273,31 +280,32 @@ impl<'s> Iterator for Lexer<'s> {
     }
 }
 
-fn is_special(c: char) -> bool {
+fn is_special(c: u8) -> bool {
     matches!(
         c,
-        '\\' | '['
-            | ']'
-            | '('
-            | ')'
-            | '{'
-            | '}'
-            | '*'
-            | '^'
-            | '='
-            | '+'
-            | '~'
-            | '_'
-            | '\''
-            | '"'
-            | '-'
-            | '!'
-            | '<'
-            | '|'
-            | ':'
-            | '`'
-            | '.'
-            | '\n'
+        b'\\'
+            | b'['
+            | b']'
+            | b'('
+            | b')'
+            | b'{'
+            | b'}'
+            | b'*'
+            | b'^'
+            | b'='
+            | b'+'
+            | b'~'
+            | b'_'
+            | b'\''
+            | b'"'
+            | b'-'
+            | b'!'
+            | b'<'
+            | b'|'
+            | b':'
+            | b'`'
+            | b'.'
+            | b'\n'
     )
 }