lex
This commit is contained in:
parent
8bd9323c48
commit
a41673a3b6
2 changed files with 145 additions and 126 deletions
|
@ -92,9 +92,11 @@ impl Parser {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse<'a>(&'a mut self, src: &'a str) -> impl Iterator<Item = Event> + 'a {
|
pub fn parse<'a>(&'a mut self, src: &'a str) -> impl Iterator<Item = Event> + 'a {
|
||||||
std::iter::from_fn(|| {
|
let mut lexer = lex::Lexer::new(src).peekable();
|
||||||
|
std::iter::from_fn(move || {
|
||||||
|
dbg!(&src);
|
||||||
if self.events.is_empty() {
|
if self.events.is_empty() {
|
||||||
Parse::new(src, &mut self.openers, &mut self.events).parse();
|
Parse::new(&mut lexer, &mut self.openers, &mut self.events).parse();
|
||||||
}
|
}
|
||||||
|
|
||||||
self.events.pop()
|
self.events.pop()
|
||||||
|
@ -102,34 +104,26 @@ impl Parser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Parse<'s> {
|
struct Parse<'l, 's, 'e> {
|
||||||
lexer: lex::Lexer<'s>,
|
tokens: &'l mut std::iter::Peekable<lex::Lexer<'s>>,
|
||||||
openers: &'s mut Vec<Container>,
|
openers: &'e mut Vec<Container>,
|
||||||
events: &'s mut Vec<Event>,
|
events: &'e mut Vec<Event>,
|
||||||
|
|
||||||
/// Next token to be eaten.
|
|
||||||
next_token: lex::Token,
|
|
||||||
/// Position after `next_token`.
|
|
||||||
pos: usize,
|
|
||||||
/// Span of last eaten token.
|
|
||||||
span: Span,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'s> Parse<'s> {
|
impl<'l, 's, 'e> Parse<'l, 's, 'e> {
|
||||||
fn new(src: &'s str, openers: &'s mut Vec<Container>, events: &'s mut Vec<Event>) -> Self {
|
fn new(
|
||||||
let mut lexer = lex::Lexer::new(src);
|
tokens: &'l mut std::iter::Peekable<lex::Lexer<'s>>,
|
||||||
let next_token = lexer.next_token();
|
openers: &'e mut Vec<Container>,
|
||||||
let pos = next_token.len;
|
events: &'e mut Vec<Event>,
|
||||||
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
lexer,
|
tokens,
|
||||||
openers,
|
openers,
|
||||||
events,
|
events,
|
||||||
next_token,
|
|
||||||
pos,
|
|
||||||
span: Span::new(0, 0),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
fn step(&mut self) -> lex::Token {
|
fn step(&mut self) -> lex::Token {
|
||||||
let token = self.lexer.next_token();
|
let token = self.lexer.next_token();
|
||||||
dbg!(&token, self.pos);
|
dbg!(&token, self.pos);
|
||||||
|
@ -147,50 +141,56 @@ impl<'s> Parse<'s> {
|
||||||
fn peek(&mut self) -> &lex::Kind {
|
fn peek(&mut self) -> &lex::Kind {
|
||||||
&self.next_token.kind
|
&self.next_token.kind
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
fn peek(&mut self) -> Option<&lex::Kind> {
|
||||||
|
self.tokens.peek().map(|t| &t.kind)
|
||||||
|
}
|
||||||
|
|
||||||
fn parse(&mut self) {
|
fn parse(&mut self) {
|
||||||
let mut kind = self.eat();
|
let mut t = if let Some(t) = self.tokens.next() {
|
||||||
|
t
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
//dbg!(&kind);
|
//dbg!(&kind);
|
||||||
|
|
||||||
if kind == lex::Kind::Eof {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
{
|
||||||
let verbatim_opt = match kind {
|
let verbatim_opt = match t.kind {
|
||||||
lex::Kind::Seq(lex::Sequence::Dollar) => {
|
lex::Kind::Seq(lex::Sequence::Dollar) => {
|
||||||
let math_opt = (self.span.len() <= 2)
|
let math_opt = (t.len <= 2)
|
||||||
.then(|| {
|
.then(|| {
|
||||||
if let lex::Kind::Seq(lex::Sequence::Backtick) = self.peek() {
|
if let Some(lex::Kind::Seq(lex::Sequence::Backtick)) = self.peek() {
|
||||||
Some((DisplayMath, self.span.len()))
|
Some((DisplayMath, t.len))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.flatten();
|
.flatten();
|
||||||
if math_opt.is_some() {
|
if math_opt.is_some() {
|
||||||
self.eat(); // backticks
|
self.tokens.next(); // backticks
|
||||||
}
|
}
|
||||||
math_opt
|
math_opt
|
||||||
}
|
}
|
||||||
lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, self.span.len())),
|
lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, t.len)),
|
||||||
_ => None,
|
_ => None,
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some((atom, opener_len)) = verbatim_opt {
|
if let Some((atom, opener_len)) = verbatim_opt {
|
||||||
while !matches!(kind, lex::Kind::Seq(lex::Sequence::Backtick))
|
for tok in &mut self.tokens {
|
||||||
|| self.span.len() != opener_len
|
if matches!(tok.kind, lex::Kind::Seq(lex::Sequence::Backtick))
|
||||||
|
&& tok.len == opener_len
|
||||||
{
|
{
|
||||||
kind = self.eat();
|
|
||||||
}
|
|
||||||
self.events.push(Event::Atom(atom));
|
self.events.push(Event::Atom(atom));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
let container_opt = match kind {
|
let container_opt = match t.kind {
|
||||||
lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)),
|
lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)),
|
||||||
lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)),
|
lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)),
|
||||||
lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)),
|
lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)),
|
||||||
|
@ -235,7 +235,7 @@ impl<'s> Parse<'s> {
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
if let lex::Kind::Open(Delimiter::Brace) = kind {
|
if let lex::Kind::Open(Delimiter::Brace) = t.kind {
|
||||||
todo!(); // check for attr
|
todo!(); // check for attr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
189
src/lex.rs
189
src/lex.rs
|
@ -5,7 +5,7 @@ use Kind::*;
|
||||||
use Sequence::*;
|
use Sequence::*;
|
||||||
use Symbol::*;
|
use Symbol::*;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub(crate) struct Token {
|
pub(crate) struct Token {
|
||||||
pub kind: Kind,
|
pub kind: Kind,
|
||||||
pub len: usize,
|
pub len: usize,
|
||||||
|
@ -84,6 +84,7 @@ pub(crate) struct Lexer<'s> {
|
||||||
chars: std::str::Chars<'s>,
|
chars: std::str::Chars<'s>,
|
||||||
escape: bool,
|
escape: bool,
|
||||||
next: Option<Token>,
|
next: Option<Token>,
|
||||||
|
len: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'s> Lexer<'s> {
|
impl<'s> Lexer<'s> {
|
||||||
|
@ -93,25 +94,7 @@ impl<'s> Lexer<'s> {
|
||||||
chars: src.chars(),
|
chars: src.chars(),
|
||||||
escape: false,
|
escape: false,
|
||||||
next: None,
|
next: None,
|
||||||
}
|
len: 0,
|
||||||
}
|
|
||||||
|
|
||||||
pub fn next_token(&mut self) -> Token {
|
|
||||||
if let Some(token) = self.next.take() {
|
|
||||||
token
|
|
||||||
} else {
|
|
||||||
let mut current = self.token();
|
|
||||||
|
|
||||||
// concatenate text tokens
|
|
||||||
if let Token { kind: Text, len } = &mut current {
|
|
||||||
self.next = Some(self.token());
|
|
||||||
while let Some(Token { kind: Text, len: l }) = self.next {
|
|
||||||
*len += l;
|
|
||||||
self.next = Some(self.token());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
current
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -119,12 +102,10 @@ impl<'s> Lexer<'s> {
|
||||||
self.chars.clone().next().unwrap_or(EOF)
|
self.chars.clone().next().unwrap_or(EOF)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn eat(&mut self) -> char {
|
fn eat(&mut self) -> Option<char> {
|
||||||
self.chars.next().unwrap_or(EOF)
|
let c = self.chars.next();
|
||||||
}
|
self.len += c.map_or(0, char::len_utf8);
|
||||||
|
c
|
||||||
fn len(&self) -> usize {
|
|
||||||
self.src.len() - self.chars.as_str().len()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
|
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
|
||||||
|
@ -133,14 +114,14 @@ impl<'s> Lexer<'s> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn token(&mut self) -> Token {
|
fn token(&mut self) -> Option<Token> {
|
||||||
let first = self.eat();
|
self.len = 0;
|
||||||
|
|
||||||
|
let first = self.eat()?;
|
||||||
|
|
||||||
let escape = self.escape;
|
let escape = self.escape;
|
||||||
|
|
||||||
let kind = match first {
|
let kind = match first {
|
||||||
EOF => Eof,
|
|
||||||
|
|
||||||
_ if escape && first == ' ' => Nbsp,
|
_ if escape && first == ' ' => Nbsp,
|
||||||
_ if escape => Text,
|
_ if escape => Text,
|
||||||
|
|
||||||
|
@ -222,9 +203,10 @@ impl<'s> Lexer<'s> {
|
||||||
self.escape = false;
|
self.escape = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
let len = self.len();
|
Some(Token {
|
||||||
|
kind,
|
||||||
Token { kind, len }
|
len: self.len,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn eat_seq(&mut self, s: Sequence) -> Kind {
|
fn eat_seq(&mut self, s: Sequence) -> Kind {
|
||||||
|
@ -242,6 +224,29 @@ impl<'s> Lexer<'s> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'s> Iterator for Lexer<'s> {
|
||||||
|
type Item = Token;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
if let Some(token) = self.next.take() {
|
||||||
|
Some(token)
|
||||||
|
} else {
|
||||||
|
let mut current = self.token();
|
||||||
|
|
||||||
|
// concatenate text tokens
|
||||||
|
if let Some(Token { kind: Text, len }) = &mut current {
|
||||||
|
self.next = self.token();
|
||||||
|
while let Some(Token { kind: Text, len: l }) = self.next {
|
||||||
|
*len += l;
|
||||||
|
self.next = self.token();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
current
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::Delimiter::*;
|
use super::Delimiter::*;
|
||||||
|
@ -249,6 +254,7 @@ mod test {
|
||||||
use super::Sequence::*;
|
use super::Sequence::*;
|
||||||
use super::Symbol::*;
|
use super::Symbol::*;
|
||||||
|
|
||||||
|
/*
|
||||||
fn tokenize(src: &str) -> impl Iterator<Item = super::Token> + '_ {
|
fn tokenize(src: &str) -> impl Iterator<Item = super::Token> + '_ {
|
||||||
let mut lexer = super::Lexer::new(src);
|
let mut lexer = super::Lexer::new(src);
|
||||||
std::iter::from_fn(move || {
|
std::iter::from_fn(move || {
|
||||||
|
@ -260,16 +266,23 @@ mod test {
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
macro_rules! test_lex {
|
macro_rules! test_lex {
|
||||||
($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
|
($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
|
||||||
#[allow(unused)]
|
#[allow(unused)]
|
||||||
let actual = tokenize($src).map(|t| t.kind).collect::<Vec<_>>();
|
let actual = super::Lexer::new($src).collect::<Vec<_>>();
|
||||||
let expected = vec![$($($token),*,)?];
|
let expected = vec![$($($token),*,)?];
|
||||||
assert_eq!(actual, expected, "{}", $src);
|
assert_eq!(actual, expected, "{}", $src);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl super::Kind {
|
||||||
|
fn l(self, len: usize) -> super::Token {
|
||||||
|
super::Token { kind: self, len }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn empty() {
|
fn empty() {
|
||||||
test_lex!("");
|
test_lex!("");
|
||||||
|
@ -277,85 +290,91 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn basic() {
|
fn basic() {
|
||||||
test_lex!("abc", Text);
|
test_lex!("abc", Text.l(3));
|
||||||
test_lex!(
|
test_lex!(
|
||||||
"para w/ some _emphasis_ and *strong*.",
|
"para w/ some _emphasis_ and *strong*.",
|
||||||
Text,
|
Text.l(4),
|
||||||
Whitespace,
|
Whitespace.l(1),
|
||||||
Text,
|
Text.l(2),
|
||||||
Whitespace,
|
Whitespace.l(1),
|
||||||
Text,
|
Text.l(4),
|
||||||
Whitespace,
|
Whitespace.l(1),
|
||||||
Sym(Underscore),
|
Sym(Underscore).l(1),
|
||||||
Text,
|
Text.l(8),
|
||||||
Sym(Underscore),
|
Sym(Underscore).l(1),
|
||||||
Whitespace,
|
Whitespace.l(1),
|
||||||
Text,
|
Text.l(3),
|
||||||
Whitespace,
|
Whitespace.l(1),
|
||||||
Sym(Asterisk),
|
Sym(Asterisk).l(1),
|
||||||
Text,
|
Text.l(6),
|
||||||
Sym(Asterisk),
|
Sym(Asterisk).l(1),
|
||||||
Seq(Period)
|
Seq(Period).l(1),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn escape() {
|
fn escape() {
|
||||||
test_lex!(r#"\a"#, Text);
|
test_lex!(r#"\a"#, Text.l(2));
|
||||||
test_lex!(r#"\\a"#, Escape, Text);
|
test_lex!(r#"\\a"#, Escape.l(1), Text.l(2));
|
||||||
test_lex!(r#"\."#, Escape, Text);
|
test_lex!(r#"\."#, Escape.l(1), Text.l(1));
|
||||||
test_lex!(r#"\ "#, Escape, Nbsp);
|
test_lex!(r#"\ "#, Escape.l(1), Nbsp.l(1));
|
||||||
test_lex!(r#"\{-"#, Escape, Text, Seq(Hyphen));
|
test_lex!(r#"\{-"#, Escape.l(1), Text.l(1), Seq(Hyphen).l(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn delim() {
|
fn delim() {
|
||||||
test_lex!("{-", Open(BraceHyphen));
|
test_lex!("{-", Open(BraceHyphen).l(2));
|
||||||
test_lex!("-}", Close(BraceHyphen));
|
test_lex!("-}", Close(BraceHyphen).l(2));
|
||||||
test_lex!("{++}", Open(BracePlus), Close(BracePlus));
|
test_lex!("{++}", Open(BracePlus).l(2), Close(BracePlus).l(2));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn sym() {
|
fn sym() {
|
||||||
test_lex!(
|
test_lex!(
|
||||||
r#"'*^=!><%|+"~_"#,
|
r#"'*^=!><%|+"~_"#,
|
||||||
Sym(Quote1),
|
Sym(Quote1).l(1),
|
||||||
Sym(Asterisk),
|
Sym(Asterisk).l(1),
|
||||||
Sym(Caret),
|
Sym(Caret).l(1),
|
||||||
Sym(Equal),
|
Sym(Equal).l(1),
|
||||||
Sym(Exclaim),
|
Sym(Exclaim).l(1),
|
||||||
Sym(Gt),
|
Sym(Gt).l(1),
|
||||||
Sym(Lt),
|
Sym(Lt).l(1),
|
||||||
Sym(Percentage),
|
Sym(Percentage).l(1),
|
||||||
Sym(Pipe),
|
Sym(Pipe).l(1),
|
||||||
Sym(Plus),
|
Sym(Plus).l(1),
|
||||||
Sym(Quote2),
|
Sym(Quote2).l(1),
|
||||||
Sym(Tilde),
|
Sym(Tilde).l(1),
|
||||||
Sym(Underscore),
|
Sym(Underscore).l(1),
|
||||||
|
);
|
||||||
|
test_lex!(
|
||||||
|
"''''",
|
||||||
|
Sym(Quote1).l(1),
|
||||||
|
Sym(Quote1).l(1),
|
||||||
|
Sym(Quote1).l(1),
|
||||||
|
Sym(Quote1).l(1),
|
||||||
);
|
);
|
||||||
test_lex!("''''", Sym(Quote1), Sym(Quote1), Sym(Quote1), Sym(Quote1),);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn seq() {
|
fn seq() {
|
||||||
test_lex!("`", Seq(Backtick));
|
test_lex!("`", Seq(Backtick).l(1));
|
||||||
test_lex!("```", Seq(Backtick));
|
test_lex!("```", Seq(Backtick).l(3));
|
||||||
test_lex!(
|
test_lex!(
|
||||||
"`:$#-.",
|
"`:$#-.",
|
||||||
Seq(Backtick),
|
Seq(Backtick).l(1),
|
||||||
Seq(Colon),
|
Seq(Colon).l(1),
|
||||||
Seq(Dollar),
|
Seq(Dollar).l(1),
|
||||||
Seq(Hash),
|
Seq(Hash).l(1),
|
||||||
Seq(Hyphen),
|
Seq(Hyphen).l(1),
|
||||||
Seq(Period),
|
Seq(Period).l(1),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn int() {
|
fn int() {
|
||||||
test_lex!("1", Integer);
|
test_lex!("1", Integer.l(1));
|
||||||
test_lex!("123", Integer);
|
test_lex!("123", Integer.l(3));
|
||||||
test_lex!("1234567890", Integer);
|
test_lex!("1234567890", Integer.l(10));
|
||||||
test_lex!("000", Integer);
|
test_lex!("000", Integer.l(3));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue