2022-11-20 13:13:48 -05:00
|
|
|
use crate::EOF;
|
|
|
|
|
|
|
|
use Delimiter::*;
|
2022-11-21 13:56:11 -05:00
|
|
|
use Kind::*;
|
2022-11-20 13:13:48 -05:00
|
|
|
use Sequence::*;
|
|
|
|
use Symbol::*;
|
|
|
|
|
2022-11-22 13:19:21 -05:00
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
2022-11-20 13:13:48 -05:00
|
|
|
pub(crate) struct Token {
|
2022-11-21 13:44:59 -05:00
|
|
|
pub kind: Kind,
|
2022-11-20 13:13:48 -05:00
|
|
|
pub len: usize,
|
|
|
|
}
|
|
|
|
|
2022-12-24 05:18:15 -05:00
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
2022-11-21 13:44:59 -05:00
|
|
|
pub enum Kind {
|
2022-11-20 13:13:48 -05:00
|
|
|
Text,
|
2022-12-01 12:09:09 -05:00
|
|
|
Newline,
|
2022-11-20 13:13:48 -05:00
|
|
|
Whitespace,
|
|
|
|
Nbsp,
|
2022-12-01 12:09:09 -05:00
|
|
|
Hardbreak,
|
2022-11-20 13:13:48 -05:00
|
|
|
Escape,
|
|
|
|
Open(Delimiter),
|
|
|
|
Close(Delimiter),
|
|
|
|
Sym(Symbol),
|
|
|
|
Seq(Sequence),
|
|
|
|
}
|
|
|
|
|
2022-11-22 13:19:21 -05:00
|
|
|
/// Delimiters that can be carried by [`Kind::Open`] and [`Kind::Close`].
///
/// The `Brace*` variants other than [`Delimiter::Brace`] are written as `{x`
/// when opening and `x}` when closing, where `x` is the character named by
/// the variant.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Delimiter {
    /// A lone `{`.
    Brace,
    /// `{*` / `*}`
    BraceAsterisk,
    /// `{^` / `^}`
    BraceCaret,
    /// `{=` / `=}`
    BraceEqual,
    /// `{-` / `-}`
    BraceHyphen,
    /// `{+` / `+}`
    BracePlus,
    /// `{~` / `~}`
    BraceTilde,
    /// `{_` / `_}`
    BraceUnderscore,
    /// `[` / `]`
    Bracket,
    /// `{'` / `'}`
    BraceQuote1,
    /// `{"` / `"}`
    BraceQuote2,
}
|
|
|
|
|
2022-11-22 13:19:21 -05:00
|
|
|
/// Symbols that can be carried by [`Kind::Sym`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Symbol {
    /// `*`
    Asterisk,
    /// `^`
    Caret,
    /// `![`
    ExclaimBracket,
    /// `<`
    Lt,
    /// `|`
    Pipe,
    /// `'`
    Quote1,
    /// `"`
    Quote2,
    /// `~`
    Tilde,
    /// `_`
    Underscore,
}
|
|
|
|
|
|
|
|
/// Characters that are lexed as a run of repeats, carried by [`Kind::Seq`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Sequence {
    /// One or more `` ` ``.
    Backtick,
    /// One or more `$`.
    Dollar,
    /// One or more `-`.
    Hyphen,
    /// One or more `.`.
    Period,
}

impl Sequence {
    /// Returns the character that this sequence is made of.
    fn ch(self) -> char {
        match self {
            Sequence::Backtick => '`',
            Sequence::Dollar => '$',
            Sequence::Hyphen => '-',
            Sequence::Period => '.',
        }
    }
}
|
|
|
|
|
2022-11-22 13:19:21 -05:00
|
|
|
#[derive(Clone)]
|
2023-01-17 16:36:10 -05:00
|
|
|
pub(crate) struct Lexer<I: Iterator + Clone> {
|
2022-12-11 12:47:00 -05:00
|
|
|
chars: I,
|
2023-01-17 16:36:10 -05:00
|
|
|
chars_non_peeked: I,
|
2022-12-11 04:45:05 -05:00
|
|
|
/// Next character should be escaped.
|
2022-11-20 13:13:48 -05:00
|
|
|
escape: bool,
|
2022-12-11 04:45:05 -05:00
|
|
|
/// Token to be peeked or next'ed.
|
2022-11-20 13:13:48 -05:00
|
|
|
next: Option<Token>,
|
2022-12-11 04:45:05 -05:00
|
|
|
/// Length of current token.
|
2022-11-21 17:32:28 -05:00
|
|
|
len: usize,
|
2022-11-20 13:13:48 -05:00
|
|
|
}
|
|
|
|
|
2022-12-11 12:47:00 -05:00
|
|
|
impl<I: Iterator<Item = char> + Clone> Lexer<I> {
|
|
|
|
pub fn new(chars: I) -> Lexer<I> {
|
2022-11-20 13:13:48 -05:00
|
|
|
Lexer {
|
2023-01-17 16:36:10 -05:00
|
|
|
chars: chars.clone(),
|
|
|
|
chars_non_peeked: chars,
|
2022-11-20 13:13:48 -05:00
|
|
|
escape: false,
|
|
|
|
next: None,
|
2022-11-21 17:32:28 -05:00
|
|
|
len: 0,
|
2022-11-21 16:40:11 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-17 16:36:10 -05:00
|
|
|
/// NOTE: Peeked [`Kind::Text`] tokens are only one char long, they may be longer when
|
|
|
|
/// consumed.
|
2022-12-11 03:25:35 -05:00
|
|
|
pub fn peek(&mut self) -> Option<&Token> {
|
|
|
|
if self.next.is_none() {
|
2023-01-17 16:36:10 -05:00
|
|
|
self.next = self.token();
|
2022-12-11 03:25:35 -05:00
|
|
|
}
|
|
|
|
self.next.as_ref()
|
|
|
|
}
|
|
|
|
|
2023-01-17 16:36:10 -05:00
|
|
|
pub fn chars(&self) -> I {
|
|
|
|
self.chars_non_peeked.clone()
|
2022-12-11 14:49:57 -05:00
|
|
|
}
|
|
|
|
|
2022-12-11 03:37:57 -05:00
|
|
|
fn next_token(&mut self) -> Option<Token> {
|
|
|
|
let mut current = self.token();
|
2023-01-17 16:36:10 -05:00
|
|
|
self.chars_non_peeked = self.chars.clone();
|
2022-12-11 03:37:57 -05:00
|
|
|
|
|
|
|
// concatenate text tokens
|
|
|
|
if let Some(Token { kind: Text, len }) = &mut current {
|
|
|
|
self.next = self.token();
|
|
|
|
while let Some(Token { kind: Text, len: l }) = self.next {
|
|
|
|
*len += l;
|
|
|
|
self.next = self.token();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
current
|
|
|
|
}
|
|
|
|
|
2023-01-19 16:50:37 -05:00
|
|
|
fn peek_char_n(&mut self, n: usize) -> char {
|
|
|
|
self.chars.clone().nth(n).unwrap_or(EOF)
|
|
|
|
}
|
|
|
|
|
2022-12-11 03:25:35 -05:00
|
|
|
fn peek_char(&mut self) -> char {
|
2023-01-19 16:50:37 -05:00
|
|
|
self.peek_char_n(0)
|
2022-11-20 13:13:48 -05:00
|
|
|
}
|
|
|
|
|
2022-12-11 03:25:35 -05:00
|
|
|
fn eat_char(&mut self) -> Option<char> {
|
2022-11-21 17:32:28 -05:00
|
|
|
let c = self.chars.next();
|
|
|
|
self.len += c.map_or(0, char::len_utf8);
|
|
|
|
c
|
2022-11-20 13:13:48 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
|
2022-12-11 03:25:35 -05:00
|
|
|
while predicate(self.peek_char()) {
|
|
|
|
self.eat_char();
|
2022-11-20 13:13:48 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-21 17:32:28 -05:00
|
|
|
fn token(&mut self) -> Option<Token> {
|
2023-01-17 16:36:10 -05:00
|
|
|
self.chars_non_peeked = self.chars.clone();
|
2022-11-21 17:32:28 -05:00
|
|
|
self.len = 0;
|
|
|
|
|
2022-12-11 03:25:35 -05:00
|
|
|
let first = self.eat_char()?;
|
2022-11-20 13:13:48 -05:00
|
|
|
|
|
|
|
let escape = self.escape;
|
|
|
|
|
|
|
|
let kind = match first {
|
2022-12-01 12:09:09 -05:00
|
|
|
_ if escape && first == '\n' => Hardbreak,
|
2022-12-02 02:16:47 -05:00
|
|
|
_ if escape
|
|
|
|
&& matches!(first, '\t' | ' ')
|
|
|
|
&& self.chars.clone().find(|c| !matches!(c, ' ' | '\t')) == Some('\n') =>
|
|
|
|
{
|
2022-12-11 03:25:35 -05:00
|
|
|
while self.eat_char() != Some('\n') {}
|
2022-12-02 02:16:47 -05:00
|
|
|
Hardbreak
|
|
|
|
}
|
|
|
|
_ if escape && first == ' ' => Nbsp,
|
2022-11-20 13:13:48 -05:00
|
|
|
_ if escape => Text,
|
|
|
|
|
|
|
|
'\\' => {
|
2022-12-11 03:25:35 -05:00
|
|
|
let next = self.peek_char();
|
2022-12-02 02:16:47 -05:00
|
|
|
if next.is_whitespace() || next.is_ascii_punctuation() {
|
2022-11-20 13:13:48 -05:00
|
|
|
self.escape = true;
|
|
|
|
Escape
|
|
|
|
} else {
|
|
|
|
Text
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-01 12:09:09 -05:00
|
|
|
'\n' => Newline,
|
2022-11-20 13:13:48 -05:00
|
|
|
_ if first.is_whitespace() => {
|
|
|
|
self.eat_while(char::is_whitespace);
|
|
|
|
Whitespace
|
|
|
|
}
|
|
|
|
|
|
|
|
'[' => Open(Bracket),
|
|
|
|
']' => Close(Bracket),
|
|
|
|
'{' => {
|
2022-12-11 03:25:35 -05:00
|
|
|
let explicit = match self.peek_char() {
|
2022-11-20 13:13:48 -05:00
|
|
|
'*' => Some(Open(BraceAsterisk)),
|
|
|
|
'^' => Some(Open(BraceCaret)),
|
2022-11-27 17:56:19 -05:00
|
|
|
'=' => Some(Open(BraceEqual)),
|
2022-11-20 13:13:48 -05:00
|
|
|
'-' => Some(Open(BraceHyphen)),
|
|
|
|
'+' => Some(Open(BracePlus)),
|
|
|
|
'~' => Some(Open(BraceTilde)),
|
|
|
|
'_' => Some(Open(BraceUnderscore)),
|
2023-01-27 13:04:01 -05:00
|
|
|
'\'' => Some(Open(BraceQuote1)),
|
|
|
|
'"' => Some(Open(BraceQuote2)),
|
2022-11-20 13:13:48 -05:00
|
|
|
_ => None,
|
|
|
|
};
|
|
|
|
if let Some(exp) = explicit {
|
2022-12-11 03:25:35 -05:00
|
|
|
self.eat_char();
|
2022-11-20 13:13:48 -05:00
|
|
|
exp
|
|
|
|
} else {
|
|
|
|
Open(Brace)
|
|
|
|
}
|
|
|
|
}
|
2023-01-27 11:11:39 -05:00
|
|
|
'*' => self.maybe_eat_close_brace(Sym(Asterisk), BraceAsterisk),
|
|
|
|
'^' => self.maybe_eat_close_brace(Sym(Caret), BraceCaret),
|
2023-01-27 11:05:45 -05:00
|
|
|
'=' => self.maybe_eat_close_brace(Text, BraceEqual),
|
2023-01-27 11:13:07 -05:00
|
|
|
'+' => self.maybe_eat_close_brace(Text, BracePlus),
|
2023-01-27 11:11:39 -05:00
|
|
|
'~' => self.maybe_eat_close_brace(Sym(Tilde), BraceTilde),
|
|
|
|
'_' => self.maybe_eat_close_brace(Sym(Underscore), BraceUnderscore),
|
2023-01-27 13:04:01 -05:00
|
|
|
'\'' => self.maybe_eat_close_brace(Sym(Quote1), BraceQuote1),
|
|
|
|
'"' => self.maybe_eat_close_brace(Sym(Quote2), BraceQuote2),
|
2022-11-20 13:13:48 -05:00
|
|
|
'-' => {
|
2022-12-11 03:25:35 -05:00
|
|
|
if self.peek_char() == '}' {
|
|
|
|
self.eat_char();
|
2022-11-20 13:13:48 -05:00
|
|
|
Close(BraceHyphen)
|
|
|
|
} else {
|
2023-01-19 16:50:37 -05:00
|
|
|
while self.peek_char() == '-' && self.peek_char_n(1) != '}' {
|
|
|
|
self.eat_char();
|
|
|
|
}
|
|
|
|
Seq(Hyphen)
|
2022-11-20 13:13:48 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-17 12:03:06 -05:00
|
|
|
'!' if self.peek_char() == '[' => {
|
|
|
|
self.eat_char();
|
|
|
|
Sym(ExclaimBracket)
|
|
|
|
}
|
2022-11-20 13:13:48 -05:00
|
|
|
'<' => Sym(Lt),
|
|
|
|
'|' => Sym(Pipe),
|
|
|
|
|
|
|
|
'`' => self.eat_seq(Backtick),
|
2022-11-21 13:44:59 -05:00
|
|
|
'$' => self.eat_seq(Dollar),
|
2022-11-20 13:13:48 -05:00
|
|
|
'.' => self.eat_seq(Period),
|
|
|
|
|
|
|
|
_ => Text,
|
|
|
|
};
|
|
|
|
|
|
|
|
if escape {
|
|
|
|
self.escape = false;
|
|
|
|
}
|
|
|
|
|
2022-11-21 17:32:28 -05:00
|
|
|
Some(Token {
|
|
|
|
kind,
|
|
|
|
len: self.len,
|
|
|
|
})
|
2022-11-20 13:13:48 -05:00
|
|
|
}
|
|
|
|
|
2022-11-21 13:44:59 -05:00
|
|
|
fn eat_seq(&mut self, s: Sequence) -> Kind {
|
2022-11-20 13:13:48 -05:00
|
|
|
self.eat_while(|c| c == s.ch());
|
|
|
|
Seq(s)
|
|
|
|
}
|
|
|
|
|
2023-01-27 11:11:39 -05:00
|
|
|
fn maybe_eat_close_brace(&mut self, kind: Kind, d: Delimiter) -> Kind {
|
2022-12-11 03:25:35 -05:00
|
|
|
if self.peek_char() == '}' {
|
|
|
|
self.eat_char();
|
2022-11-20 13:13:48 -05:00
|
|
|
Close(d)
|
|
|
|
} else {
|
2023-01-27 11:11:39 -05:00
|
|
|
kind
|
2022-11-20 13:13:48 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-11 12:47:00 -05:00
|
|
|
impl<I: Iterator<Item = char> + Clone> Iterator for Lexer<I> {
|
2022-11-21 17:32:28 -05:00
|
|
|
type Item = Token;
|
|
|
|
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
2023-01-17 16:36:10 -05:00
|
|
|
self.next
|
|
|
|
.take()
|
|
|
|
.map(|x| {
|
|
|
|
self.chars_non_peeked = self.chars.clone();
|
|
|
|
x
|
|
|
|
})
|
|
|
|
.or_else(|| self.next_token())
|
2022-11-21 17:32:28 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-20 13:13:48 -05:00
|
|
|
#[cfg(test)]
mod test {
    use super::Delimiter::*;
    use super::Kind::*;
    use super::Sequence::*;
    use super::Symbol::*;

    /// Lexes `$src` and asserts that the resulting tokens equal the listed
    /// ones. With no tokens listed, the lexer must produce nothing.
    macro_rules! test_lex {
        ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
            #[allow(unused)]
            let actual = super::Lexer::new($src.chars()).collect::<Vec<_>>();
            let expected = vec![$($($token),*,)?];
            assert_eq!(actual, expected, "{}", $src);
        };
    }

    impl super::Kind {
        /// Shorthand for building an expected token of this kind with byte
        /// length `len`.
        fn l(self, len: usize) -> super::Token {
            super::Token { kind: self, len }
        }
    }

    #[test]
    fn empty() {
        test_lex!("");
    }

    #[test]
    fn basic() {
        test_lex!("abc", Text.l(3));
        test_lex!(
            "para w/ some _emphasis_ and *strong*.",
            Text.l(4),
            Whitespace.l(1),
            Text.l(2),
            Whitespace.l(1),
            Text.l(4),
            Whitespace.l(1),
            Sym(Underscore).l(1),
            Text.l(8),
            Sym(Underscore).l(1),
            Whitespace.l(1),
            Text.l(3),
            Whitespace.l(1),
            Sym(Asterisk).l(1),
            Text.l(6),
            Sym(Asterisk).l(1),
            Seq(Period).l(1),
        );
    }

    #[test]
    fn escape() {
        test_lex!(r#"\a"#, Text.l(2));
        test_lex!(r#"\\a"#, Escape.l(1), Text.l(2));
        test_lex!(r#"\."#, Escape.l(1), Text.l(1));
        test_lex!(r#"\ "#, Escape.l(1), Nbsp.l(1));
        test_lex!(r#"\{-"#, Escape.l(1), Text.l(1), Seq(Hyphen).l(1));
    }

    #[test]
    fn hardbreak() {
        test_lex!("a\\\n", Text.l(1), Escape.l(1), Hardbreak.l(1));
        // One space plus the newline: the hard break covers two bytes.
        test_lex!("a\\ \n", Text.l(1), Escape.l(1), Hardbreak.l(2));
        test_lex!("a\\\t \t \n", Text.l(1), Escape.l(1), Hardbreak.l(5));
    }

    #[test]
    fn delim() {
        test_lex!("{-", Open(BraceHyphen).l(2));
        test_lex!("-}", Close(BraceHyphen).l(2));
        test_lex!("{++}", Open(BracePlus).l(2), Close(BracePlus).l(2));
    }

    #[test]
    fn sym() {
        test_lex!(
            r#"'*^![<|"~_"#,
            Sym(Quote1).l(1),
            Sym(Asterisk).l(1),
            Sym(Caret).l(1),
            Sym(ExclaimBracket).l(2),
            Sym(Lt).l(1),
            Sym(Pipe).l(1),
            Sym(Quote2).l(1),
            Sym(Tilde).l(1),
            Sym(Underscore).l(1),
        );
        test_lex!(
            "''''",
            Sym(Quote1).l(1),
            Sym(Quote1).l(1),
            Sym(Quote1).l(1),
            Sym(Quote1).l(1),
        );
    }

    #[test]
    fn seq() {
        test_lex!("`", Seq(Backtick).l(1));
        test_lex!("```", Seq(Backtick).l(3));
        test_lex!(
            "`$-.",
            Seq(Backtick).l(1),
            Seq(Dollar).l(1),
            Seq(Hyphen).l(1),
            Seq(Period).l(1),
        );
    }
}
|