This commit is contained in:
Noah Hellman 2022-11-21 23:32:28 +01:00
parent 8bd9323c48
commit a41673a3b6
2 changed files with 145 additions and 126 deletions

View file

@ -92,9 +92,11 @@ impl Parser {
} }
pub fn parse<'a>(&'a mut self, src: &'a str) -> impl Iterator<Item = Event> + 'a { pub fn parse<'a>(&'a mut self, src: &'a str) -> impl Iterator<Item = Event> + 'a {
std::iter::from_fn(|| { let mut lexer = lex::Lexer::new(src).peekable();
std::iter::from_fn(move || {
dbg!(&src);
if self.events.is_empty() { if self.events.is_empty() {
Parse::new(src, &mut self.openers, &mut self.events).parse(); Parse::new(&mut lexer, &mut self.openers, &mut self.events).parse();
} }
self.events.pop() self.events.pop()
@ -102,34 +104,26 @@ impl Parser {
} }
} }
struct Parse<'s> { struct Parse<'l, 's, 'e> {
lexer: lex::Lexer<'s>, tokens: &'l mut std::iter::Peekable<lex::Lexer<'s>>,
openers: &'s mut Vec<Container>, openers: &'e mut Vec<Container>,
events: &'s mut Vec<Event>, events: &'e mut Vec<Event>,
/// Next token to be eaten.
next_token: lex::Token,
/// Position after `next_token`.
pos: usize,
/// Span of last eaten token.
span: Span,
} }
impl<'s> Parse<'s> { impl<'l, 's, 'e> Parse<'l, 's, 'e> {
fn new(src: &'s str, openers: &'s mut Vec<Container>, events: &'s mut Vec<Event>) -> Self { fn new(
let mut lexer = lex::Lexer::new(src); tokens: &'l mut std::iter::Peekable<lex::Lexer<'s>>,
let next_token = lexer.next_token(); openers: &'e mut Vec<Container>,
let pos = next_token.len; events: &'e mut Vec<Event>,
) -> Self {
Self { Self {
lexer, tokens,
openers, openers,
events, events,
next_token,
pos,
span: Span::new(0, 0),
} }
} }
/*
fn step(&mut self) -> lex::Token { fn step(&mut self) -> lex::Token {
let token = self.lexer.next_token(); let token = self.lexer.next_token();
dbg!(&token, self.pos); dbg!(&token, self.pos);
@ -147,50 +141,56 @@ impl<'s> Parse<'s> {
fn peek(&mut self) -> &lex::Kind { fn peek(&mut self) -> &lex::Kind {
&self.next_token.kind &self.next_token.kind
} }
*/
/// Returns the kind of the next pending token without consuming it, or
/// `None` when the token stream has nothing further to yield.
fn peek(&mut self) -> Option<&lex::Kind> {
    let upcoming = self.tokens.peek()?;
    Some(&upcoming.kind)
}
fn parse(&mut self) { fn parse(&mut self) {
let mut kind = self.eat(); let mut t = if let Some(t) = self.tokens.next() {
t
} else {
return;
};
//dbg!(&kind); //dbg!(&kind);
if kind == lex::Kind::Eof {
return;
}
{ {
let verbatim_opt = match kind { let verbatim_opt = match t.kind {
lex::Kind::Seq(lex::Sequence::Dollar) => { lex::Kind::Seq(lex::Sequence::Dollar) => {
let math_opt = (self.span.len() <= 2) let math_opt = (t.len <= 2)
.then(|| { .then(|| {
if let lex::Kind::Seq(lex::Sequence::Backtick) = self.peek() { if let Some(lex::Kind::Seq(lex::Sequence::Backtick)) = self.peek() {
Some((DisplayMath, self.span.len())) Some((DisplayMath, t.len))
} else { } else {
None None
} }
}) })
.flatten(); .flatten();
if math_opt.is_some() { if math_opt.is_some() {
self.eat(); // backticks self.tokens.next(); // backticks
} }
math_opt math_opt
} }
lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, self.span.len())), lex::Kind::Seq(lex::Sequence::Backtick) => Some((Verbatim, t.len)),
_ => None, _ => None,
}; };
if let Some((atom, opener_len)) = verbatim_opt { if let Some((atom, opener_len)) = verbatim_opt {
while !matches!(kind, lex::Kind::Seq(lex::Sequence::Backtick)) for tok in &mut self.tokens {
|| self.span.len() != opener_len if matches!(tok.kind, lex::Kind::Seq(lex::Sequence::Backtick))
&& tok.len == opener_len
{ {
kind = self.eat();
}
self.events.push(Event::Atom(atom)); self.events.push(Event::Atom(atom));
return; return;
} }
} }
}
}
{ {
let container_opt = match kind { let container_opt = match t.kind {
lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)), lex::Kind::Sym(Symbol::Asterisk) => Some((Strong, Dir::Both)),
lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)), lex::Kind::Sym(Symbol::Underscore) => Some((Emphasis, Dir::Both)),
lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)), lex::Kind::Sym(Symbol::Caret) => Some((Superscript, Dir::Both)),
@ -235,7 +235,7 @@ impl<'s> Parse<'s> {
} }
{ {
if let lex::Kind::Open(Delimiter::Brace) = kind { if let lex::Kind::Open(Delimiter::Brace) = t.kind {
todo!(); // check for attr todo!(); // check for attr
} }
} }

View file

@ -5,7 +5,7 @@ use Kind::*;
use Sequence::*; use Sequence::*;
use Symbol::*; use Symbol::*;
#[derive(Debug)] #[derive(Debug, PartialEq, Eq)]
pub(crate) struct Token { pub(crate) struct Token {
pub kind: Kind, pub kind: Kind,
pub len: usize, pub len: usize,
@ -84,6 +84,7 @@ pub(crate) struct Lexer<'s> {
chars: std::str::Chars<'s>, chars: std::str::Chars<'s>,
escape: bool, escape: bool,
next: Option<Token>, next: Option<Token>,
len: usize,
} }
impl<'s> Lexer<'s> { impl<'s> Lexer<'s> {
@ -93,25 +94,7 @@ impl<'s> Lexer<'s> {
chars: src.chars(), chars: src.chars(),
escape: false, escape: false,
next: None, next: None,
} len: 0,
}
pub fn next_token(&mut self) -> Token {
if let Some(token) = self.next.take() {
token
} else {
let mut current = self.token();
// concatenate text tokens
if let Token { kind: Text, len } = &mut current {
self.next = Some(self.token());
while let Some(Token { kind: Text, len: l }) = self.next {
*len += l;
self.next = Some(self.token());
}
}
current
} }
} }
@ -119,12 +102,10 @@ impl<'s> Lexer<'s> {
self.chars.clone().next().unwrap_or(EOF) self.chars.clone().next().unwrap_or(EOF)
} }
fn eat(&mut self) -> char { fn eat(&mut self) -> Option<char> {
self.chars.next().unwrap_or(EOF) let c = self.chars.next();
} self.len += c.map_or(0, char::len_utf8);
c
fn len(&self) -> usize {
self.src.len() - self.chars.as_str().len()
} }
fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
@ -133,14 +114,14 @@ impl<'s> Lexer<'s> {
} }
} }
fn token(&mut self) -> Token { fn token(&mut self) -> Option<Token> {
let first = self.eat(); self.len = 0;
let first = self.eat()?;
let escape = self.escape; let escape = self.escape;
let kind = match first { let kind = match first {
EOF => Eof,
_ if escape && first == ' ' => Nbsp, _ if escape && first == ' ' => Nbsp,
_ if escape => Text, _ if escape => Text,
@ -222,9 +203,10 @@ impl<'s> Lexer<'s> {
self.escape = false; self.escape = false;
} }
let len = self.len(); Some(Token {
kind,
Token { kind, len } len: self.len,
})
} }
fn eat_seq(&mut self, s: Sequence) -> Kind { fn eat_seq(&mut self, s: Sequence) -> Kind {
@ -242,6 +224,29 @@ impl<'s> Lexer<'s> {
} }
} }
/// Token stream over the lexer's input.
///
/// Consecutive `Text` tokens are merged into a single `Text` token whose
/// `len` is the sum of the merged lengths. Detecting the end of a run
/// requires reading one token past it, so that extra token is parked in
/// `self.next` and handed out by the following call.
impl<'s> Iterator for Lexer<'s> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        // A token buffered by a previous merge takes priority.
        if let Some(buffered) = self.next.take() {
            return Some(buffered);
        }

        let mut current = self.token();
        if let Some(Token { kind: Text, len: total }) = &mut current {
            // Absorb every directly following text token into this one; the
            // first non-text token (or `None`) stays buffered in `self.next`.
            loop {
                self.next = self.token();
                match self.next {
                    Some(Token { kind: Text, len: extra }) => *total += extra,
                    _ => break,
                }
            }
        }
        current
    }
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::Delimiter::*; use super::Delimiter::*;
@ -249,6 +254,7 @@ mod test {
use super::Sequence::*; use super::Sequence::*;
use super::Symbol::*; use super::Symbol::*;
/*
fn tokenize(src: &str) -> impl Iterator<Item = super::Token> + '_ { fn tokenize(src: &str) -> impl Iterator<Item = super::Token> + '_ {
let mut lexer = super::Lexer::new(src); let mut lexer = super::Lexer::new(src);
std::iter::from_fn(move || { std::iter::from_fn(move || {
@ -260,16 +266,23 @@ mod test {
} }
}) })
} }
*/
macro_rules! test_lex { macro_rules! test_lex {
($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => { ($($st:ident,)? $src:expr $(,$($token:expr),* $(,)?)?) => {
#[allow(unused)] #[allow(unused)]
let actual = tokenize($src).map(|t| t.kind).collect::<Vec<_>>(); let actual = super::Lexer::new($src).collect::<Vec<_>>();
let expected = vec![$($($token),*,)?]; let expected = vec![$($($token),*,)?];
assert_eq!(actual, expected, "{}", $src); assert_eq!(actual, expected, "{}", $src);
}; };
} }
impl super::Kind {
    /// Test helper: pairs this kind with `len` to build the expected `Token`.
    fn l(self, len: usize) -> super::Token {
        let kind = self;
        super::Token { kind, len }
    }
}
#[test] #[test]
fn empty() { fn empty() {
test_lex!(""); test_lex!("");
@ -277,85 +290,91 @@ mod test {
#[test] #[test]
fn basic() { fn basic() {
test_lex!("abc", Text); test_lex!("abc", Text.l(3));
test_lex!( test_lex!(
"para w/ some _emphasis_ and *strong*.", "para w/ some _emphasis_ and *strong*.",
Text, Text.l(4),
Whitespace, Whitespace.l(1),
Text, Text.l(2),
Whitespace, Whitespace.l(1),
Text, Text.l(4),
Whitespace, Whitespace.l(1),
Sym(Underscore), Sym(Underscore).l(1),
Text, Text.l(8),
Sym(Underscore), Sym(Underscore).l(1),
Whitespace, Whitespace.l(1),
Text, Text.l(3),
Whitespace, Whitespace.l(1),
Sym(Asterisk), Sym(Asterisk).l(1),
Text, Text.l(6),
Sym(Asterisk), Sym(Asterisk).l(1),
Seq(Period) Seq(Period).l(1),
); );
} }
#[test] #[test]
fn escape() { fn escape() {
test_lex!(r#"\a"#, Text); test_lex!(r#"\a"#, Text.l(2));
test_lex!(r#"\\a"#, Escape, Text); test_lex!(r#"\\a"#, Escape.l(1), Text.l(2));
test_lex!(r#"\."#, Escape, Text); test_lex!(r#"\."#, Escape.l(1), Text.l(1));
test_lex!(r#"\ "#, Escape, Nbsp); test_lex!(r#"\ "#, Escape.l(1), Nbsp.l(1));
test_lex!(r#"\{-"#, Escape, Text, Seq(Hyphen)); test_lex!(r#"\{-"#, Escape.l(1), Text.l(1), Seq(Hyphen).l(1));
} }
#[test] #[test]
fn delim() { fn delim() {
test_lex!("{-", Open(BraceHyphen)); test_lex!("{-", Open(BraceHyphen).l(2));
test_lex!("-}", Close(BraceHyphen)); test_lex!("-}", Close(BraceHyphen).l(2));
test_lex!("{++}", Open(BracePlus), Close(BracePlus)); test_lex!("{++}", Open(BracePlus).l(2), Close(BracePlus).l(2));
} }
#[test] #[test]
fn sym() { fn sym() {
test_lex!( test_lex!(
r#"'*^=!><%|+"~_"#, r#"'*^=!><%|+"~_"#,
Sym(Quote1), Sym(Quote1).l(1),
Sym(Asterisk), Sym(Asterisk).l(1),
Sym(Caret), Sym(Caret).l(1),
Sym(Equal), Sym(Equal).l(1),
Sym(Exclaim), Sym(Exclaim).l(1),
Sym(Gt), Sym(Gt).l(1),
Sym(Lt), Sym(Lt).l(1),
Sym(Percentage), Sym(Percentage).l(1),
Sym(Pipe), Sym(Pipe).l(1),
Sym(Plus), Sym(Plus).l(1),
Sym(Quote2), Sym(Quote2).l(1),
Sym(Tilde), Sym(Tilde).l(1),
Sym(Underscore), Sym(Underscore).l(1),
);
test_lex!(
"''''",
Sym(Quote1).l(1),
Sym(Quote1).l(1),
Sym(Quote1).l(1),
Sym(Quote1).l(1),
); );
test_lex!("''''", Sym(Quote1), Sym(Quote1), Sym(Quote1), Sym(Quote1),);
} }
#[test] #[test]
fn seq() { fn seq() {
test_lex!("`", Seq(Backtick)); test_lex!("`", Seq(Backtick).l(1));
test_lex!("```", Seq(Backtick)); test_lex!("```", Seq(Backtick).l(3));
test_lex!( test_lex!(
"`:$#-.", "`:$#-.",
Seq(Backtick), Seq(Backtick).l(1),
Seq(Colon), Seq(Colon).l(1),
Seq(Dollar), Seq(Dollar).l(1),
Seq(Hash), Seq(Hash).l(1),
Seq(Hyphen), Seq(Hyphen).l(1),
Seq(Period), Seq(Period).l(1),
); );
} }
#[test] #[test]
fn int() { fn int() {
test_lex!("1", Integer); test_lex!("1", Integer.l(1));
test_lex!("123", Integer); test_lex!("123", Integer.l(3));
test_lex!("1234567890", Integer); test_lex!("1234567890", Integer.l(10));
test_lex!("000", Integer); test_lex!("000", Integer.l(3));
} }
} }