// jotdown/src/lex.rs
2022-11-20 13:13:48 -05:00
use Delimiter::*;
2022-11-21 13:56:11 -05:00
use Kind::*;
2022-11-20 13:13:48 -05:00
use Sequence::*;
use Symbol::*;
2022-11-22 13:19:21 -05:00
#[derive(Debug, Clone, PartialEq, Eq)]
2022-11-20 13:13:48 -05:00
pub(crate) struct Token {
2022-11-21 13:44:59 -05:00
pub kind: Kind,
2022-11-20 13:13:48 -05:00
pub len: usize,
}
2022-12-24 05:18:15 -05:00
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
2022-11-21 13:44:59 -05:00
pub enum Kind {
2022-11-20 13:13:48 -05:00
Text,
2022-12-01 12:09:09 -05:00
Newline,
2022-11-20 13:13:48 -05:00
Whitespace,
Nbsp,
2022-12-01 12:09:09 -05:00
Hardbreak,
2022-11-20 13:13:48 -05:00
Escape,
Open(Delimiter),
Close(Delimiter),
Sym(Symbol),
Seq(Sequence),
DollarBacktick(u8),
2022-11-20 13:13:48 -05:00
}
/// Delimiters that come in opening/closing pairs, see [`Kind::Open`] and [`Kind::Close`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Delimiter {
    /// `{` / `}`
    Brace,
    /// `{*` / `*}`
    BraceAsterisk,
    /// `{^` / `^}`
    BraceCaret,
    /// `{=` / `=}`
    BraceEqual,
    /// `{-` / `-}`
    BraceHyphen,
    /// `{+` / `+}`
    BracePlus,
    /// `{~` / `~}`
    BraceTilde,
    /// `{_` / `_}`
    BraceUnderscore,
    /// `[` / `]`
    Bracket,
    /// `{'` / `'}`
    BraceQuote1,
    /// `{"` / `"}`
    BraceQuote2,
    /// `(` / `)`
    Paren,
}
/// Stand-alone symbols, see [`Kind::Sym`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Symbol {
    /// `*`
    Asterisk,
    /// `^`
    Caret,
    /// `![`
    ExclaimBracket,
    /// `<`
    Lt,
    /// `|`
    Pipe,
    /// `'`
    Quote1,
    /// `"`
    Quote2,
    /// `~`
    Tilde,
    /// `_`
    Underscore,
    /// `:`
    Colon,
}
/// Characters that are lexed as a run of one or more repetitions, see [`Kind::Seq`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Sequence {
    /// A run of `` ` ``.
    Backtick,
    /// A run of `-`.
    Hyphen,
    /// A run of `.`.
    Period,
}

impl Sequence {
    /// The character that a run of this kind is made of.
    fn ch(self) -> char {
        match self {
            Sequence::Hyphen => '-',
            Sequence::Period => '.',
            Sequence::Backtick => '`',
        }
    }
}
2022-11-22 13:19:21 -05:00
#[derive(Clone)]
pub(crate) struct Lexer<'s> {
src: &'s str,
chars: std::str::Chars<'s>,
2022-12-11 04:45:05 -05:00
/// Next character should be escaped.
2022-11-20 13:13:48 -05:00
escape: bool,
2022-12-11 04:45:05 -05:00
/// Token to be peeked or next'ed.
2022-11-20 13:13:48 -05:00
next: Option<Token>,
2022-12-11 04:45:05 -05:00
/// Length of current token.
2022-11-21 17:32:28 -05:00
len: usize,
2022-11-20 13:13:48 -05:00
}
impl<'s> Lexer<'s> {
pub fn new(src: &'s str) -> Self {
2022-11-20 13:13:48 -05:00
Lexer {
src,
chars: src.chars(),
2022-11-20 13:13:48 -05:00
escape: false,
next: None,
2022-11-21 17:32:28 -05:00
len: 0,
2022-11-21 16:40:11 -05:00
}
}
2023-01-17 16:36:10 -05:00
/// NOTE: Peeked [`Kind::Text`] tokens are only one char long, they may be longer when
/// consumed.
2022-12-11 03:25:35 -05:00
pub fn peek(&mut self) -> Option<&Token> {
if self.next.is_none() {
2023-01-17 16:36:10 -05:00
self.next = self.token();
2022-12-11 03:25:35 -05:00
}
self.next.as_ref()
}
/// Return the part of the source that has not been handed out as tokens yet.
///
/// A token that has already been lexed into `self.next` but not yet returned still counts
/// as being ahead, so its length is subtracted from the position of the char iterator.
pub fn ahead(&self) -> &'s str {
    let pending_len = self.next.as_ref().map_or(0, |t| t.len);
    let pos = self.src.len() - self.chars.as_str().len() - pending_len;
    &self.src[pos..]
}
2022-12-11 03:37:57 -05:00
fn next_token(&mut self) -> Option<Token> {
let mut current = self.token();
// concatenate text tokens
if let Some(Token { kind: Text, len }) = &mut current {
self.next = self.token();
while let Some(Token { kind: Text, len: l }) = self.next {
*len += l;
self.next = self.token();
}
}
current
}
/// Return the character `n` positions ahead (`0` is the next one) without consuming
/// anything.
///
/// Works on a clone of the char iterator, so a shared borrow is sufficient.
fn peek_char_n(&self, n: usize) -> Option<char> {
    self.chars.clone().nth(n)
}
fn peek_char(&mut self) -> Option<char> {
self.peek_char_n(0)
2022-11-20 13:13:48 -05:00
}
2022-12-11 03:25:35 -05:00
fn eat_char(&mut self) -> Option<char> {
2022-11-21 17:32:28 -05:00
let c = self.chars.next();
self.len += c.map_or(0, char::len_utf8);
c
2022-11-20 13:13:48 -05:00
}
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
while let Some(c) = self.peek_char() {
if predicate(c) {
self.eat_char();
} else {
break;
}
2022-11-20 13:13:48 -05:00
}
}
2022-11-21 17:32:28 -05:00
fn token(&mut self) -> Option<Token> {
self.len = 0;
2022-12-11 03:25:35 -05:00
let first = self.eat_char()?;
2022-11-20 13:13:48 -05:00
let escape = self.escape;
let kind = match first {
2022-12-01 12:09:09 -05:00
_ if escape && first == '\n' => Hardbreak,
_ if escape
&& matches!(first, '\t' | ' ')
&& self.chars.clone().find(|c| !matches!(c, ' ' | '\t')) == Some('\n') =>
{
2022-12-11 03:25:35 -05:00
while self.eat_char() != Some('\n') {}
Hardbreak
}
_ if escape && first == ' ' => Nbsp,
2022-11-20 13:13:48 -05:00
_ if escape => Text,
'\\' => {
if self
.peek_char()
.map_or(false, |c| c.is_whitespace() || c.is_ascii_punctuation())
{
2022-11-20 13:13:48 -05:00
self.escape = true;
Escape
} else {
Text
}
}
2022-12-01 12:09:09 -05:00
'\n' => Newline,
2022-11-20 13:13:48 -05:00
_ if first.is_whitespace() => {
self.eat_while(char::is_whitespace);
Whitespace
}
'[' => Open(Bracket),
']' => Close(Bracket),
'(' => Open(Paren),
')' => Close(Paren),
2022-11-20 13:13:48 -05:00
'{' => {
2022-12-11 03:25:35 -05:00
let explicit = match self.peek_char() {
Some('*') => Some(Open(BraceAsterisk)),
Some('^') => Some(Open(BraceCaret)),
Some('=') => Some(Open(BraceEqual)),
Some('-') => Some(Open(BraceHyphen)),
Some('+') => Some(Open(BracePlus)),
Some('~') => Some(Open(BraceTilde)),
Some('_') => Some(Open(BraceUnderscore)),
Some('\'') => Some(Open(BraceQuote1)),
Some('"') => Some(Open(BraceQuote2)),
2022-11-20 13:13:48 -05:00
_ => None,
};
if let Some(exp) = explicit {
2022-12-11 03:25:35 -05:00
self.eat_char();
2022-11-20 13:13:48 -05:00
exp
} else {
Open(Brace)
}
}
'}' => Close(Brace),
'*' => self.maybe_eat_close_brace(Sym(Asterisk), BraceAsterisk),
'^' => self.maybe_eat_close_brace(Sym(Caret), BraceCaret),
2023-01-27 11:05:45 -05:00
'=' => self.maybe_eat_close_brace(Text, BraceEqual),
2023-01-27 11:13:07 -05:00
'+' => self.maybe_eat_close_brace(Text, BracePlus),
'~' => self.maybe_eat_close_brace(Sym(Tilde), BraceTilde),
'_' => self.maybe_eat_close_brace(Sym(Underscore), BraceUnderscore),
2023-01-27 13:04:01 -05:00
'\'' => self.maybe_eat_close_brace(Sym(Quote1), BraceQuote1),
'"' => self.maybe_eat_close_brace(Sym(Quote2), BraceQuote2),
2022-11-20 13:13:48 -05:00
'-' => {
if self.peek_char() == Some('}') {
2022-12-11 03:25:35 -05:00
self.eat_char();
2022-11-20 13:13:48 -05:00
Close(BraceHyphen)
} else {
while self.peek_char() == Some('-') && self.peek_char_n(1) != Some('}') {
self.eat_char();
}
Seq(Hyphen)
2022-11-20 13:13:48 -05:00
}
}
'!' if self.peek_char() == Some('[') => {
2022-12-17 12:03:06 -05:00
self.eat_char();
Sym(ExclaimBracket)
}
2022-11-20 13:13:48 -05:00
'<' => Sym(Lt),
'|' => Sym(Pipe),
2023-02-04 11:10:38 -05:00
':' => Sym(Colon),
2022-11-20 13:13:48 -05:00
'`' => self.eat_seq(Backtick),
'.' => self.eat_seq(Period),
'$' => {
self.eat_while(|c| c == '$');
let mut n_ticks: u8 = 0;
self.eat_while(|c| {
if c == '`' {
if let Some(l) = n_ticks.checked_add(1) {
n_ticks = l;
return true;
}
}
false
});
DollarBacktick(n_ticks)
}
2022-11-20 13:13:48 -05:00
_ => Text,
};
if escape {
self.escape = false;
}
2022-11-21 17:32:28 -05:00
Some(Token {
kind,
len: self.len,
})
2022-11-20 13:13:48 -05:00
}
2022-11-21 13:44:59 -05:00
fn eat_seq(&mut self, s: Sequence) -> Kind {
2022-11-20 13:13:48 -05:00
self.eat_while(|c| c == s.ch());
Seq(s)
}
fn maybe_eat_close_brace(&mut self, kind: Kind, d: Delimiter) -> Kind {
if self.peek_char() == Some('}') {
2022-12-11 03:25:35 -05:00
self.eat_char();
2022-11-20 13:13:48 -05:00
Close(d)
} else {
kind
2022-11-20 13:13:48 -05:00
}
}
}
impl<'s> Iterator for Lexer<'s> {
2022-11-21 17:32:28 -05:00
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
self.next.take().or_else(|| self.next_token())
2022-11-21 17:32:28 -05:00
}
}
#[cfg(test)]
mod test {
    use super::Delimiter::*;
    use super::Kind::*;
    use super::Sequence::*;
    use super::Symbol::*;

    /// Lex `$src` and compare the produced tokens with the listed expected tokens.
    macro_rules! test_lex {
        ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
            #[allow(unused)]
            let actual = super::Lexer::new($src).collect::<Vec<_>>();
            let expected = vec![$($($token),*,)?];
            assert_eq!(actual, expected, "{}", $src);
        };
    }

    impl super::Kind {
        /// Shorthand for building the expected token of this kind with length `len`.
        fn l(self, len: usize) -> super::Token {
            super::Token { kind: self, len }
        }
    }

    #[test]
    fn empty() {
        test_lex!("");
    }

    #[test]
    fn basic() {
        test_lex!("abc", Text.l(3));
        test_lex!(
            "para w/ some _emphasis_ and *strong*.",
            Text.l(4),
            Whitespace.l(1),
            Text.l(2),
            Whitespace.l(1),
            Text.l(4),
            Whitespace.l(1),
            Sym(Underscore).l(1),
            Text.l(8),
            Sym(Underscore).l(1),
            Whitespace.l(1),
            Text.l(3),
            Whitespace.l(1),
            Sym(Asterisk).l(1),
            Text.l(6),
            Sym(Asterisk).l(1),
            Seq(Period).l(1),
        );
    }

    #[test]
    fn escape() {
        test_lex!(r#"\a"#, Text.l(2));
        test_lex!(r#"\\a"#, Escape.l(1), Text.l(2));
        test_lex!(r#"\."#, Escape.l(1), Text.l(1));
        test_lex!(r#"\ "#, Escape.l(1), Nbsp.l(1));
        test_lex!(r#"\{-"#, Escape.l(1), Text.l(1), Seq(Hyphen).l(1));
    }

    #[test]
    fn hardbreak() {
        test_lex!("a\\\n", Text.l(1), Escape.l(1), Hardbreak.l(1));
        // The hard break spans the escaped blanks plus the newline:
        // one space + newline is 2 bytes, three spaces + newline is 4 bytes.
        test_lex!("a\\ \n", Text.l(1), Escape.l(1), Hardbreak.l(2));
        test_lex!("a\\   \n", Text.l(1), Escape.l(1), Hardbreak.l(4));
        test_lex!("a\\\t \t \n", Text.l(1), Escape.l(1), Hardbreak.l(5));
    }

    #[test]
    fn delim() {
        test_lex!("{-", Open(BraceHyphen).l(2));
        test_lex!("-}", Close(BraceHyphen).l(2));
        test_lex!("{++}", Open(BracePlus).l(2), Close(BracePlus).l(2));
    }

    #[test]
    fn sym() {
        test_lex!(
            r#"'*^![<|"~_"#,
            Sym(Quote1).l(1),
            Sym(Asterisk).l(1),
            Sym(Caret).l(1),
            Sym(ExclaimBracket).l(2),
            Sym(Lt).l(1),
            Sym(Pipe).l(1),
            Sym(Quote2).l(1),
            Sym(Tilde).l(1),
            Sym(Underscore).l(1),
        );
        test_lex!(
            "''''",
            Sym(Quote1).l(1),
            Sym(Quote1).l(1),
            Sym(Quote1).l(1),
            Sym(Quote1).l(1),
        );
    }

    #[test]
    fn seq() {
        test_lex!("`", Seq(Backtick).l(1));
        test_lex!("```", Seq(Backtick).l(3));
        test_lex!(
            "`-.",
            Seq(Backtick).l(1),
            Seq(Hyphen).l(1),
            Seq(Period).l(1),
        );
    }

    #[test]
    fn dollar_backtick() {
        test_lex!("$`", DollarBacktick(1).l(2));
        test_lex!("$$$`", DollarBacktick(1).l(4));
        test_lex!("$$````", DollarBacktick(4).l(6));
    }
}