atomic state wip

This commit is contained in:
Noah Hellman 2022-12-11 12:26:40 +01:00
parent 463f146623
commit 227c86f4f0

View file

@ -59,9 +59,9 @@ pub struct Event {
} }
/// Current parsing state of elements that are not recursive, i.e. may not contain arbitrary inline /// Current parsing state of elements that are not recursive, i.e. may not contain arbitrary inline
/// elements, can only be one of these at a time. /// elements. There can only be one of these at a time, due to the non-recursion.
#[derive(Debug)] #[derive(Debug)]
enum State { enum AtomicState {
None, None,
/// Within a verbatim element, e.g. '$`xxxxx' /// Within a verbatim element, e.g. '$`xxxxx'
Verbatim { Verbatim {
@ -82,7 +82,7 @@ enum State {
ReferenceLinkTag, ReferenceLinkTag,
} }
impl State { impl AtomicState {
fn verbatim(&self) -> Option<(Container, usize, usize)> { fn verbatim(&self) -> Option<(Container, usize, usize)> {
if let Self::Verbatim { if let Self::Verbatim {
kind, kind,
@ -98,27 +98,34 @@ impl State {
} }
pub struct Parser<'s> { pub struct Parser<'s> {
openers: Vec<(Container, usize)>, /// The last inline element has been provided, finish current events.
events: std::collections::VecDeque<Event>,
span: Span,
lexer: lex::Lexer<'s>,
state: State,
last: bool, last: bool,
/// Lexer, hosting upcoming source.
lexer: lex::Lexer<'s>,
/// Span of current event.
span: Span,
/// State of non-recursive elements.
atomic_state: AtomicState,
/// Stack with kind and index of _potential_ openers for typesetting containers.
typesets: Vec<(Container, usize)>,
/// Stack with index of _potential_ span/link openers.
spans: Vec<usize>,
//attributes: Vec<(Span, usize)>,
/// Buffer queue for next events. Events are buffered until no modifications due to future
/// characters are needed.
events: std::collections::VecDeque<Event>,
} }
impl<'s> Parser<'s> { impl<'s> Parser<'s> {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
openers: Vec::new(),
events: std::collections::VecDeque::new(),
span: Span::new(0, 0),
lexer: lex::Lexer::new(""),
state: State::None,
last: false, last: false,
lexer: lex::Lexer::new(""),
span: Span::new(0, 0),
atomic_state: AtomicState::None,
typesets: Vec::new(),
spans: Vec::new(),
events: std::collections::VecDeque::new(),
} }
} }
@ -149,8 +156,10 @@ impl<'s> Parser<'s> {
fn parse_event(&mut self) -> Option<Event> { fn parse_event(&mut self) -> Option<Event> {
self.reset_span(); self.reset_span();
self.eat().map(|first| { self.eat().map(|first| {
self.parse_verbatim(&first) self.atomic(&first)
.or_else(|| self.parse_container(&first)) .or_else(|| self.parse_verbatim(&first))
.or_else(|| self.parse_span(&first))
.or_else(|| self.parse_typeset(&first))
.or_else(|| self.parse_atom(&first)) .or_else(|| self.parse_atom(&first))
.unwrap_or(Event { .unwrap_or(Event {
kind: EventKind::Str, kind: EventKind::Str,
@ -159,37 +168,22 @@ impl<'s> Parser<'s> {
}) })
} }
fn parse_atom(&mut self, first: &lex::Token) -> Option<Event> { fn atomic(&mut self, first: &lex::Token) -> Option<Event> {
let atom = match first.kind { Some(match self.atomic_state {
lex::Kind::Newline => Softbreak, AtomicState::None => return None,
lex::Kind::Hardbreak => Hardbreak, AtomicState::Verbatim {
lex::Kind::Escape => Escape, kind,
lex::Kind::Nbsp => Nbsp, opener_len,
lex::Kind::Seq(lex::Sequence::Period) if first.len == 3 => Ellipsis, opener_event,
lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 2 => EnDash, } => {
lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 3 => EmDash,
_ => return None,
};
Some(Event {
kind: EventKind::Atom(atom),
span: self.span,
})
}
fn parse_verbatim(&mut self, first: &lex::Token) -> Option<Event> {
self.state
.verbatim()
.map(|(kind, opener_len, opener_event)| {
dbg!(&self.events, opener_event);
assert_eq!(self.events[opener_event].kind, EventKind::Enter(kind)); assert_eq!(self.events[opener_event].kind, EventKind::Enter(kind));
let kind = if matches!(first.kind, lex::Kind::Seq(lex::Sequence::Backtick)) let kind = if matches!(first.kind, lex::Kind::Seq(lex::Sequence::Backtick))
&& first.len == opener_len && first.len == opener_len
{ {
self.state = State::None; self.atomic_state = AtomicState::None;
let kind = let kind =
if matches!(kind, Verbatim) && self.lexer.peek_ahead().starts_with("{=") { if matches!(kind, Verbatim) && self.lexer.peek_ahead().starts_with("{=") {
let mut chars = self.lexer.peek_ahead()[2..].chars(); let mut chars = self.lexer.peek_ahead()["{=".len()..].chars();
let len = chars let len = chars
.clone() .clone()
.take_while(|c| !c.is_whitespace() && !matches!(c, '{' | '}')) .take_while(|c| !c.is_whitespace() && !matches!(c, '{' | '}'))
@ -215,8 +209,14 @@ impl<'s> Parser<'s> {
kind, kind,
span: self.span, span: self.span,
} }
}
AtomicState::Attributes { .. } => todo!(),
AtomicState::Url { .. } => todo!(),
AtomicState::ReferenceLinkTag => todo!(),
}) })
.or_else(|| { }
fn parse_verbatim(&mut self, first: &lex::Token) -> Option<Event> {
match first.kind { match first.kind {
lex::Kind::Seq(lex::Sequence::Dollar) => { lex::Kind::Seq(lex::Sequence::Dollar) => {
let math_opt = (first.len <= 2) let math_opt = (first.len <= 2)
@ -248,8 +248,7 @@ impl<'s> Parser<'s> {
_ => None, _ => None,
} }
.map(|(kind, opener_len)| { .map(|(kind, opener_len)| {
dbg!(&self.events); self.atomic_state = AtomicState::Verbatim {
self.state = State::Verbatim {
kind, kind,
opener_len, opener_len,
opener_event: self.events.len(), opener_event: self.events.len(),
@ -259,10 +258,41 @@ impl<'s> Parser<'s> {
span: self.span, span: self.span,
} }
}) })
}
fn parse_span(&mut self, first: &lex::Token) -> Option<Event> {
match first.kind {
lex::Kind::Open(Delimiter::Bracket) => Some(true),
lex::Kind::Close(Delimiter::Bracket) => Some(false),
_ => None,
}
.map(|open| {
if open {
self.spans.push(self.events.len());
// use str for now, replace if closed later
Event {
kind: EventKind::Str,
span: self.span,
}
} else {
if self.lexer.peek_ahead().starts_with('[') {
let mut chars = self.lexer.peek_ahead()["[".len()..].chars();
let len = chars
.clone()
.take_while(|c| !c.is_whitespace() && !matches!(c, '[' | ']'))
.count();
match chars.nth(len) {
Some(']') => todo!(),
None => self.atomic_state = AtomicState::ReferenceLinkTag,
_ => todo!(),
}
}
todo!()
}
}) })
} }
fn parse_container(&mut self, first: &lex::Token) -> Option<Event> { fn parse_typeset(&mut self, first: &lex::Token) -> Option<Event> {
enum Dir { enum Dir {
Open, Open,
Close, Close,
@ -276,8 +306,6 @@ impl<'s> Parser<'s> {
lex::Kind::Sym(Symbol::Tilde) => Some((Subscript, Dir::Both)), lex::Kind::Sym(Symbol::Tilde) => Some((Subscript, Dir::Both)),
lex::Kind::Sym(Symbol::Quote1) => Some((SingleQuoted, Dir::Both)), lex::Kind::Sym(Symbol::Quote1) => Some((SingleQuoted, Dir::Both)),
lex::Kind::Sym(Symbol::Quote2) => Some((DoubleQuoted, Dir::Both)), lex::Kind::Sym(Symbol::Quote2) => Some((DoubleQuoted, Dir::Both)),
lex::Kind::Open(Delimiter::Bracket) => Some((Span, Dir::Open)),
lex::Kind::Close(Delimiter::Bracket) => Some((Span, Dir::Close)),
lex::Kind::Open(Delimiter::BraceAsterisk) => Some((Strong, Dir::Open)), lex::Kind::Open(Delimiter::BraceAsterisk) => Some((Strong, Dir::Open)),
lex::Kind::Close(Delimiter::BraceAsterisk) => Some((Strong, Dir::Close)), lex::Kind::Close(Delimiter::BraceAsterisk) => Some((Strong, Dir::Close)),
lex::Kind::Open(Delimiter::BraceCaret) => Some((Superscript, Dir::Open)), lex::Kind::Open(Delimiter::BraceCaret) => Some((Superscript, Dir::Open)),
@ -295,19 +323,19 @@ impl<'s> Parser<'s> {
_ => None, _ => None,
} }
.map(|(cont, dir)| { .map(|(cont, dir)| {
self.openers self.typesets
.iter() .iter()
.rposition(|(c, _)| *c == cont) .rposition(|(c, _)| *c == cont)
.and_then(|o| { .and_then(|o| {
matches!(dir, Dir::Close | Dir::Both).then(|| { matches!(dir, Dir::Close | Dir::Both).then(|| {
let (_, e) = &mut self.openers[o]; let (_, e) = &mut self.typesets[o];
self.events[*e].kind = EventKind::Enter(cont); self.events[*e].kind = EventKind::Enter(cont);
self.openers.drain(o..); self.typesets.drain(o..);
EventKind::Exit(cont) EventKind::Exit(cont)
}) })
}) })
.unwrap_or_else(|| { .unwrap_or_else(|| {
self.openers.push((cont, self.events.len())); self.typesets.push((cont, self.events.len()));
// use str for now, replace if closed later // use str for now, replace if closed later
EventKind::Str EventKind::Str
}) })
@ -317,6 +345,24 @@ impl<'s> Parser<'s> {
span: self.span, span: self.span,
}) })
} }
fn parse_atom(&mut self, first: &lex::Token) -> Option<Event> {
let atom = match first.kind {
lex::Kind::Newline => Softbreak,
lex::Kind::Hardbreak => Hardbreak,
lex::Kind::Escape => Escape,
lex::Kind::Nbsp => Nbsp,
lex::Kind::Seq(lex::Sequence::Period) if first.len == 3 => Ellipsis,
lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 2 => EnDash,
lex::Kind::Seq(lex::Sequence::Hyphen) if first.len == 3 => EmDash,
_ => return None,
};
Some(Event {
kind: EventKind::Atom(atom),
span: self.span,
})
}
} }
impl<'s> Iterator for Parser<'s> { impl<'s> Iterator for Parser<'s> {
@ -325,8 +371,8 @@ impl<'s> Iterator for Parser<'s> {
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let mut ready = true; let mut ready = true;
while self.events.is_empty() while self.events.is_empty()
|| !self.openers.is_empty() || !self.typesets.is_empty()
|| !matches!(self.state, State::None) || !matches!(self.atomic_state, AtomicState::None)
|| self // for merge || self // for merge
.events .events
.back() .back()
@ -334,7 +380,6 @@ impl<'s> Iterator for Parser<'s> {
{ {
if let Some(ev) = self.parse_event() { if let Some(ev) = self.parse_event() {
self.events.push_back(ev); self.events.push_back(ev);
dbg!(&self.events, &self.state);
} else { } else {
ready = false; ready = false;
break; break;
@ -366,8 +411,8 @@ impl<'s> Iterator for Parser<'s> {
} }
}) })
.or_else(|| { .or_else(|| {
self.state.verbatim().map(|(kind, _, _)| { self.atomic_state.verbatim().map(|(kind, _, _)| {
self.state = State::None; self.atomic_state = AtomicState::None;
Event { Event {
kind: EventKind::Exit(kind), kind: EventKind::Exit(kind),
span: self.span, span: self.span,
@ -490,7 +535,7 @@ mod test {
} }
#[test] #[test]
fn container_basic() { fn typeset_basic() {
test_parse!( test_parse!(
"_abc_", "_abc_",
(Enter(Emphasis), "_"), (Enter(Emphasis), "_"),
@ -506,7 +551,7 @@ mod test {
} }
#[test] #[test]
fn container_nest() { fn typeset_nest() {
test_parse!( test_parse!(
"{_{_abc_}_}", "{_{_abc_}_}",
(Enter(Emphasis), "{_"), (Enter(Emphasis), "{_"),
@ -526,12 +571,12 @@ mod test {
} }
#[test] #[test]
fn container_unopened() { fn typeset_unopened() {
test_parse!("*}abc", (Str, "*}abc")); test_parse!("*}abc", (Str, "*}abc"));
} }
#[test] #[test]
fn container_close_parent() { fn typeset_close_parent() {
test_parse!( test_parse!(
"{*{_abc*}", "{*{_abc*}",
(Enter(Strong), "{*"), (Enter(Strong), "{*"),
@ -541,7 +586,7 @@ mod test {
} }
#[test] #[test]
fn container_close_block() { fn typeset_close_block() {
test_parse!("{_abc", (Str, "{_abc")); test_parse!("{_abc", (Str, "{_abc"));
test_parse!("{_{*{_abc", (Str, "{_{*{_abc")); test_parse!("{_{*{_abc", (Str, "{_{*{_abc"));
} }