commit 453c48b686dc9808251fc1d23e31e0dcc05b9e30 Author: NGnius (Graham) Date: Thu May 30 20:03:56 2024 -0400 Create initial language parser diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..16caf9d --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,129 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "logos" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "161971eb88a0da7ae0c333e1063467c5b5727e7fb6b710b8db4814eade3a42e8" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e31badd9de5131fdf4921f6473d457e3dd85b11b7f091ceb50e4df7c3eeb12a" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax", + "syn", +] + +[[package]] +name = "logos-derive" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1c2a69b3eb68d5bd595107c9ee58d7e07fe2bb5e360cc85b0f084dedac80de0a" +dependencies = [ + "logos-codegen", +] + +[[package]] +name = "muss2" +version = "0.1.0" + +[[package]] +name = "muss2-lang" +version = "0.1.0" +dependencies = [ + "logos", + "pretty_assertions", +] + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + +[[package]] +name = "proc-macro2" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex-syntax" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" + +[[package]] +name = "syn" +version = "2.0.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ca3b045 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] 
+name = "muss2" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] + +[workspace] +members = [ + "crates/lang" +] diff --git a/crates/lang/Cargo.toml b/crates/lang/Cargo.toml new file mode 100644 index 0000000..9a0c1b3 --- /dev/null +++ b/crates/lang/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "muss2-lang" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +logos = { version = "0.14" } + +[dev-dependencies] +pretty_assertions = "1.3.0" diff --git a/crates/lang/src/lexer/errors.rs b/crates/lang/src/lexer/errors.rs new file mode 100644 index 0000000..94a7b92 --- /dev/null +++ b/crates/lang/src/lexer/errors.rs @@ -0,0 +1,16 @@ +#[derive(Debug, Default, PartialEq, Eq, Clone, Copy,)] +pub enum LexError { + #[default] + UnrecognizedToken, +} + +impl core::fmt::Display for LexError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + //use core::fmt::Write; + match self { + Self::UnrecognizedToken => write!(f, "Unrecognized token"), + } + } +} + +impl std::error::Error for LexError {} diff --git a/crates/lang/src/lexer/mod.rs b/crates/lang/src/lexer/mod.rs new file mode 100644 index 0000000..fe9762b --- /dev/null +++ b/crates/lang/src/lexer/mod.rs @@ -0,0 +1,68 @@ +mod errors; +pub use errors::LexError; + +mod tokens; +pub use tokens::{Token, TokenInfo}; + +#[cfg(test)] +mod test { + use super::*; + + use pretty_assertions::assert_eq; + + const ALL_TOKENS_STR: &str = "u n + - * / && || => x> ~> = . 
n_u_ :: is_a_ :: VaR1AbLe ( ) { } : ; -12345 12345.6789 \"char[]\" /* long\ncomment */ // short comment \n \n <>"; + + #[test] + fn parse_everything() { + let expected = vec![ + Token::Union(TokenInfo { line: 0, column: 0..1, index: 0..1 }), + Token::Intersection(TokenInfo { line: 0, column: 2..3, index: 2..3 }), + Token::Plus(TokenInfo { line: 0, column: 4..5, index: 4..5 }), + Token::Minus(TokenInfo { line: 0, column: 6..7, index: 6..7 }), + Token::Multiply(TokenInfo { line: 0, column: 8..9, index: 8..9 }), + Token::Divide(TokenInfo { line: 0, column: 10..11, index: 10..11 }), + Token::And(TokenInfo { line: 0, column: 12..14, index: 12..14 }), + Token::Or(TokenInfo { line: 0, column: 15..17, index: 15..17 }), + Token::Map(TokenInfo { line: 0, column: 18..20, index: 18..20 }), + Token::Filter(TokenInfo { line: 0, column: 21..23, index: 21..23 }), + Token::Sort(TokenInfo { line: 0, column: 24..26, index: 24..26 }), + Token::Equal(TokenInfo { line: 0, column: 27..28, index: 27..28 }), + Token::Dot(TokenInfo { line: 0, column: 29..30, index: 29..30 }), + Token::Variable(("n_u_".into(), TokenInfo { line: 0, column: 31..35, index: 31..35 })), + Token::PathSeparator(TokenInfo { line: 0, column: 36..38, index: 36..38 }), + Token::Variable(("is_a_".into(), TokenInfo { line: 0, column: 39..44, index: 39..44 })), + Token::PathSeparator(TokenInfo { line: 0, column: 45..47, index: 45..47 }), + Token::Variable(("VaR1AbLe".into(), TokenInfo { line: 0, column: 48..56, index: 48..56 })), + Token::OpenRoundBracket(TokenInfo { line: 0, column: 57..58, index: 57..58 }), + Token::CloseRoundBracket(TokenInfo { line: 0, column: 59..60, index: 59..60 }), + Token::OpenCurlyBracket(TokenInfo { line: 0, column: 61..62, index: 61..62 }), + Token::CloseCurlyBracket(TokenInfo { line: 0, column: 63..64, index: 63..64 }), + Token::Colon(TokenInfo { line: 0, column: 65..66, index: 65..66 }), + Token::Semicolon(TokenInfo { line: 0, column: 67..68, index: 67..68 }), + Token::Integer((-12345, 
TokenInfo { line: 0, column: 69..75, index: 69..75 })), + Token::Float((12345.6789, TokenInfo { line: 0, column: 76..86, index: 76..86 })), + Token::String(("char[]".into(), TokenInfo { line: 0, column: 87..95, index: 87..95 })), + Token::LongComment((" long\ncomment ".into(), TokenInfo { line: 0, column: 96..114, index: 96..114 })), + Token::ShortComment((" short comment ".into(), TokenInfo { line: 1, column: 11..29, index: 115..133 })), + Token::Newline(TokenInfo { line: 2, column: 1..2, index: 134..135 }), + Token::Generate(TokenInfo { line: 3, column: 1..3, index: 136..138 }), + ]; + + let mut actual = Vec::new(); + for (index, token_result) in Token::tokenify(ALL_TOKENS_STR).enumerate() { + assert!(token_result.is_ok(), "Token #{} (expected: {:?}) failed to parse: {:?}", index, expected[index], token_result.err()); + actual.push(token_result.unwrap()); + } + + assert_eq!(actual, expected) + } + + #[test] + fn parse_reversability() { + let expected = format!("{} ", ALL_TOKENS_STR); + + let actual = Token::stringify(Token::tokenify(&expected).map(|token_result| token_result.unwrap())); + + assert_eq!(actual, expected) + } +} diff --git a/crates/lang/src/lexer/tokens.rs b/crates/lang/src/lexer/tokens.rs new file mode 100644 index 0000000..60697dd --- /dev/null +++ b/crates/lang/src/lexer/tokens.rs @@ -0,0 +1,240 @@ +use logos::Logos; + +#[derive(Default, PartialEq, Clone)] +pub struct ExtraState { + line: usize, + line_start: usize, + start: usize, + end: usize, +} + +impl ExtraState { + fn lexer_sync(lex: &mut logos::Lexer) { + let span = lex.span(); + lex.extras.start = span.start; + lex.extras.end = span.end; + } + + fn newline(lex: &mut logos::Lexer) -> TokenInfo { + Self::lexer_sync(lex); + let info = lex.extras.token_info(); + lex.extras.line += 1; + lex.extras.line_start = lex.span().end; + info + } + + fn token_info(&self) -> TokenInfo { + TokenInfo { + line: self.line, + column: (self.start - self.line_start)..(self.end - self.line_start), + index: 
self.start..self.end, + } + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct TokenInfo { + pub line: usize, + pub column: core::ops::Range, + pub index: core::ops::Range, +} + +#[derive(Logos, Debug, PartialEq, Clone)] +#[logos(skip r"[ \t\f]+")] // Ignore this regex pattern between tokens +#[logos(error = super::LexError)] +#[logos(extras = ExtraState)] +pub enum Token { + // Operands + // Set operations + #[token("u", priority = 99, callback = all_cb)] + Union(TokenInfo), + #[token("n", priority = 99, callback = all_cb)] + Intersection(TokenInfo), + // Arithmetic operations (also applicable to sets) + #[token("+", callback = all_cb)] + Plus(TokenInfo), + #[token("-", callback = all_cb)] + Minus(TokenInfo), + #[token("*", callback = all_cb)] + Multiply(TokenInfo), + #[token("/", callback = all_cb)] + Divide(TokenInfo), + // Logical operations + #[token("&&", callback = all_cb)] + And(TokenInfo), + #[token("||", callback = all_cb)] + Or(TokenInfo), + + // Functional + #[token("=>", callback = all_cb)] + Map(TokenInfo), + #[token("x>", callback = all_cb)] + Filter(TokenInfo), + #[token("~>", callback = all_cb)] + Sort(TokenInfo), + #[token("<>", callback = all_cb)] + Generate(TokenInfo), + + // Declarations + + + // Basics + #[token("=", callback = all_cb)] + Equal(TokenInfo), + #[token("::", callback = all_cb)] + PathSeparator(TokenInfo), + #[token(".", callback = all_cb)] + Dot(TokenInfo), + #[regex("[a-zA-Z_][a-zA-Z_0-9]*", priority = 1, callback = variable_cb)] + Variable((String, TokenInfo)), + #[token("(", callback = all_cb)] + OpenRoundBracket(TokenInfo), + #[token(")", callback = all_cb)] + CloseRoundBracket(TokenInfo), + #[token("{", callback = all_cb)] + OpenCurlyBracket(TokenInfo), + #[token("}", callback = all_cb)] + CloseCurlyBracket(TokenInfo), + #[token(":", callback = all_cb)] + Colon(TokenInfo), + #[token(";", callback = all_cb)] + Semicolon(TokenInfo), + + // Literals + #[regex("-?[1-9][0-9]*", priority = 1, callback = integer_cb)] + 
Integer((i64, TokenInfo)), + #[regex("-?[1-9][0-9]*\\.[0-9]+", priority = 99, callback = float_cb)] + Float((f64, TokenInfo)), + #[regex(r#""([^"\\]|\\["\\bnfrt]|u[a-fA-F0-9]{4})*""#, priority = 1, callback = string_cb)] + String((String, TokenInfo)), + + /// Comments + #[regex(r#"\/\*([^\*]+(\*[^\/])?)*\*\/"#, priority = 1, callback = multiline_comment_cb)] + LongComment((String, TokenInfo)), + #[regex("\\/\\/[^\n]*\n", priority = 1, callback = oneline_comment_cb)] + ShortComment((String, TokenInfo)), + + /// Ignore + #[regex(r"\n", newline_cb)] + Newline(TokenInfo), +} + +fn all_cb(lex: &mut logos::Lexer) -> TokenInfo { + ExtraState::lexer_sync(lex); + lex.extras.token_info() +} + +fn variable_cb(lex: &mut logos::Lexer) -> (String, TokenInfo) { + let slice = lex.slice(); + (slice.to_owned(), all_cb(lex)) +} + +fn integer_cb(lex: &mut logos::Lexer) -> (i64, TokenInfo) { + let slice = lex.slice(); + (slice.parse().unwrap(), all_cb(lex)) +} + +fn float_cb(lex: &mut logos::Lexer) -> (f64, TokenInfo) { + let slice = lex.slice(); + (slice.parse().unwrap(), all_cb(lex)) +} + +fn string_cb(lex: &mut logos::Lexer) -> (String, TokenInfo) { + let slice = lex.slice(); + // TODO handle escaped chars + (slice[1..slice.len()-1].to_owned(), all_cb(lex)) +} + +fn multiline_comment_cb(lex: &mut logos::Lexer) -> (String, TokenInfo) { + let slice = lex.slice(); + let info = all_cb(lex); + for (i, c) in slice.chars().enumerate() { + if c == '\n' { + lex.extras.line += 1; + lex.extras.line_start = lex.span().start + i + 1; + } + } + (slice[2..slice.len()-2].to_owned(), info) +} + +fn oneline_comment_cb(lex: &mut logos::Lexer) -> (String, TokenInfo) { + let slice = lex.slice(); + let info = all_cb(lex); + lex.extras.line += 1; + lex.extras.line_start = lex.span().end; + (slice[2..slice.len()-1].to_owned(), info) +} + +fn newline_cb(lex: &mut logos::Lexer) -> TokenInfo { + ExtraState::newline(lex) +} + +impl Token { + pub fn tokenify<'a>(s: &'a str) -> logos::Lexer<'a, Self> { + 
Token::lexer(s) + } + + pub fn stringify<'a>(tokens: impl core::iter::Iterator + 'a) -> String { + use core::fmt::Write; + let mut result = String::new(); + tokens.for_each(|t| { + t.write_str(&mut result).unwrap(); + write!(result, " ").unwrap(); + }); + result + } + + pub fn stringify_ref<'a, 'b>(tokens: impl core::iter::Iterator + 'a) -> String { + use core::fmt::Write; + let mut result = String::new(); + tokens.for_each(|t| { + t.write_str(&mut result).unwrap(); + write!(result, " ").unwrap(); + }); + result + } + + pub fn as_str(&self) -> String { + let mut s = String::new(); + self.write_str(&mut s).unwrap(); + s + } + + pub fn write_str(&self, result: &mut String) -> std::fmt::Result { + use core::fmt::Write; + match self { + Self::Union(_) => write!(result, "u"), + Self::Intersection(_) => write!(result, "n"), + Self::Plus(_) => write!(result, "+"), + Self::Minus(_) => write!(result, "-"), + Self::Multiply(_) => write!(result, "*"), + Self::Divide(_) => write!(result, "/"), + Self::And(_) => write!(result, "&&"), + Self::Or(_) => write!(result, "||"), + Self::Map(_) => write!(result, "=>"), + Self::Filter(_) => write!(result, "x>"), + Self::Sort(_) => write!(result, "~>"), + Self::Generate(_) => write!(result, "<>"), + Self::Equal(_) => write!(result, "="), + Self::PathSeparator(_) => write!(result, "::"), + Self::Dot(_) => write!(result, "."), + Self::Variable((name, _)) => write!(result, "{}", name), + Self::OpenRoundBracket(_) => write!(result, "("), + Self::CloseRoundBracket(_) => write!(result, ")"), + Self::OpenCurlyBracket(_) => write!(result, "{{"), + Self::CloseCurlyBracket(_) => write!(result, "}}"), + Self::Colon(_) => write!(result, ":"), + Self::Semicolon(_) => write!(result, ";"), + Self::Integer((int, _)) => write!(result, "{}", int), + Self::Float((float, _)) => write!(result, "{}", float), + Self::String((s, _)) => write!(result, "\"{}\"", s), + Self::LongComment((c, _)) => write!(result, "/*{}*/", c), + Self::ShortComment((c, _)) => 
write!(result, "//{}\n", c), + Self::Newline(_) => write!(result, "\n"), + } + } + + pub fn is_ignore(&self) -> bool { + matches!(self, Self::Newline(_)) + } +} diff --git a/crates/lang/src/lib.rs b/crates/lang/src/lib.rs new file mode 100644 index 0000000..39348d9 --- /dev/null +++ b/crates/lang/src/lib.rs @@ -0,0 +1,7 @@ +//! Language specification +//! +//! Parsing order: lexer -> syntax -> statement + +pub mod lexer; +pub mod statement; +pub mod syntax; diff --git a/crates/lang/src/statement/errors.rs b/crates/lang/src/statement/errors.rs new file mode 100644 index 0000000..c1d12c8 --- /dev/null +++ b/crates/lang/src/statement/errors.rs @@ -0,0 +1,32 @@ +#[derive(Debug, PartialEq, Clone)] +pub enum LanguageError { + InvalidSequence(Vec), + InvalidSyntax(Vec), + UnexpectedEnd(Vec), + UnexpectedToken(crate::syntax::SyntaxToken), + UnrecognizedToken, +} + +impl core::fmt::Display for LanguageError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + //use core::fmt::Write; + match self { + Self::InvalidSequence(seq) => write!(f, "Invalid sequence {:?}", seq.as_slice()), + Self::InvalidSyntax(seq) => write!(f, "Invalid syntax {}", crate::syntax::SyntaxToken::stringify_ref(seq.iter())), + Self::UnexpectedEnd(seq) => write!(f, "Unexpected end of file {} ", crate::syntax::SyntaxToken::stringify_ref(seq.iter())), + Self::UnexpectedToken(token) => write!(f, "Unexpected token {}", token.as_str()), + Self::UnrecognizedToken => write!(f, "Unrecognized token"), + } + } +} + +impl std::error::Error for LanguageError {} + +impl From for LanguageError { + fn from(value: crate::syntax::SyntaxError) -> Self { + match value { + crate::syntax::SyntaxError::UnrecognizedToken => Self::UnrecognizedToken, + crate::syntax::SyntaxError::InvalidSequence(seq) => Self::InvalidSequence(seq), + } + } +} diff --git a/crates/lang/src/statement/mod.rs b/crates/lang/src/statement/mod.rs new file mode 100644 index 0000000..ed20c24 --- /dev/null +++ 
b/crates/lang/src/statement/mod.rs @@ -0,0 +1,133 @@ +//! High-level language +mod errors; +pub use errors::LanguageError; + +mod parser; +pub use parser::LanguageParser; + +mod tree; +pub use tree::{Statement, Notification, Param, Declare, DeclareFun, DeclareType, Module, Op, DeclareAssignVar, DeclareVar, AssignVar, Dyadic, CallFun}; + +// my_namespace { <>generate_fn()=>map_fn()x>filter_fn(_)~>sort_fn(_) } + +#[cfg(test)] +mod test { + use super::*; + + use pretty_assertions::assert_eq; + + fn assert_no_errors(iter: impl Iterator>) -> Vec { + let mut statements = Vec::new(); + for (i, res) in iter.enumerate() { + match res { + Ok(statement) => if !statement.is_ignore() { statements.push(statement); }, + Err(e) => { + let e_display = e.to_string(); + match e { + LanguageError::InvalidSequence(_seq) => {}, + LanguageError::InvalidSyntax(seq) => { + let bad_syntax = seq.last().expect("Empty invalid syntax token sequence"); + let bad_lex = bad_syntax.info.last().expect("Empty token info on syntax token"); + + eprintln!( + "{} @ line {}, column {} to {} (index {} to {}), token #{}", + e_display, bad_lex.line, + bad_lex.column.start, bad_lex.column.end, + bad_lex.index.start, bad_lex.index.end, i + ); + }, + LanguageError::UnexpectedEnd(seq) => { + let bad_syntax = seq.last().expect("Empty unexpected end token sequence"); + let bad_lex = bad_syntax.info.last().expect("Empty token info on syntax token"); + + eprintln!( + "{} @ line {}, column {} to {} (index {} to {}), token #{}", + e_display, bad_lex.line, + bad_lex.column.start, bad_lex.column.end, + bad_lex.index.start, bad_lex.index.end, i + ); + }, + LanguageError::UnexpectedToken(bad_syntax) => { + let bad_lex = bad_syntax.info.last().expect("Empty token info on syntax token"); + + eprintln!( + "{} @ line {}, column {} to {} (index {} to {}), token #{}", + e_display, bad_lex.line, + bad_lex.column.start, bad_lex.column.end, + bad_lex.index.start, bad_lex.index.end, i + ); + }, + LanguageError::UnrecognizedToken => 
{ + eprintln!("Unrecognized token #{} ?!?!", i); + } + } + panic!("{} for token #{}", e_display, i); + } + } + } + statements + } + + #[test] + fn parse_minimum_module() { + let parser = LanguageParser::lex("my_module {}"); + + let parsed = assert_no_errors(parser); + assert_eq!(vec![ + Statement::Module(Module { + name: crate::syntax::Path(vec![ + "my_module".into(), + ]), + inner: Vec::new(), + }), + ], parsed); + } + + #[test] + fn parse_minimum_function_declaration() { + let parser = LanguageParser::lex("my_generator () <> {}"); + + let parsed = assert_no_errors(parser); + assert_eq!(vec![ + Statement::Declare(Declare::Function(DeclareFun { + name: crate::syntax::Path(vec![ + "my_generator".into(), + ]), + params: Vec::new(), + type_: crate::syntax::Functional::Generate, + ops: Vec::new(), + })) + ], parsed); + } + + #[test] + fn parse_minimum_type_declaration() { + let parser = LanguageParser::lex("my_type = {}"); + + let parsed = assert_no_errors(parser); + assert_eq!(vec![ + Statement::Declare(Declare::Type(DeclareType { + name: crate::syntax::Path(vec![ + "my_type".into(), + ]), + params: Vec::new(), + })) + ], parsed); + } + + #[test] + fn parse_minimum_entrypoint() { + let parser = LanguageParser::lex("<> my_generator ()"); + + let parsed = assert_no_errors(parser); + assert_eq!(vec![ + Statement::Entrypoint(CallFun { + type_: crate::syntax::Functional::Generate, + var: crate::syntax::Path(vec![ + "my_generator".into(), + ]), + params: Vec::new(), + }) + ], parsed); + } +} diff --git a/crates/lang/src/statement/parser.rs b/crates/lang/src/statement/parser.rs new file mode 100644 index 0000000..9e5a237 --- /dev/null +++ b/crates/lang/src/statement/parser.rs @@ -0,0 +1,769 @@ +pub struct LanguageParser<'a, I: core::iter::Iterator> + 'a> { + _idc: core::marker::PhantomData<&'a ()>, + iter: I, + lookahead: Option, + incomplete_modules: Vec, +} + +impl <'a, I: core::iter::Iterator> + 'a> LanguageParser<'a, I> { + pub fn new(tokens_in: I) -> Self { + Self { + 
_idc: Default::default(), + iter: tokens_in, + lookahead: None, + incomplete_modules: Vec::new(), + } + } + + fn parse_incomplete_module(&mut self, name: crate::syntax::Path) -> Result { + Ok(super::Module { name, inner: Vec::new() }) + } + + fn parse_function_decl(&mut self, name: crate::syntax::Path) -> Result { + // `Name (` (first 2 tokens) are already consumed + let mut lookahead = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + // function params + let mut params = Vec::new(); + while !matches!(lookahead.token, crate::syntax::Token::CloseRoundBracket) { + self.lookahead = Some(lookahead); + params.push(self.parse_param()?); + lookahead = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; + } + // function type + let token1 = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; + let fn_type = if let crate::syntax::Token::Functional(fn_type) = token1.token { + fn_type + } else { + return Err(super::LanguageError::InvalidSyntax(vec![token1])); + }; + // operations + let token2 = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; + if !matches!(token2.token, crate::syntax::Token::OpenCurlyBracket) { + return Err(super::LanguageError::InvalidSyntax(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Functional(fn_type), + info: token1.info, + }, + token2 + ])); + } + lookahead = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; 
+ let mut ops = Vec::new(); + while !matches!(lookahead.token, crate::syntax::Token::CloseCurlyBracket) { + self.lookahead = Some(lookahead); + ops.push(self.parse_op()?); + lookahead = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; + } + Ok(super::DeclareFun { + name, + params, + type_: fn_type, + ops, + }) + } + + fn parse_type_decl(&mut self, name: crate::syntax::Path) -> Result { + // `Name =` (first 2 tokens) are already consumed + let token0 = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + match token0.token { + crate::syntax::Token::OpenCurlyBracket => { + let mut lookahead = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; + let mut params = Vec::new(); + while !matches!(lookahead.token, crate::syntax::Token::CloseCurlyBracket) { + self.lookahead = Some(lookahead); + params.push(self.parse_param()?); + lookahead = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; + } + Ok(super::DeclareType { + name, + params, + }) + } + t => Err(super::LanguageError::InvalidSyntax(vec![ + crate::syntax::SyntaxToken { + token: t, + info: token0.info, + } + ])) + } + } + + fn parse_function_call(&mut self, fun: crate::syntax::Functional) -> Result { + // `Func` (first token) is already consumed + let token0 = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + let name 
= if let crate::syntax::Token::Path(name) = token0.token { + name + } else { + return Err(super::LanguageError::UnexpectedToken(token0)); + }; + let token1 = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + }; + if !matches!(token1.token, crate::syntax::Token::OpenRoundBracket) { + return Err(super::LanguageError::UnexpectedToken(token1)); + } + let op_params = match self.parse_op_params() { + Ok(ops) => ops, + Err(e) => { + return Err(Self::extend_err_tokens(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::OpenRoundBracket, + info: token1.info, + }, + ], e)); + } + }; + let token_last = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + if !matches!(token_last.token, crate::syntax::Token::CloseRoundBracket) { + return Err(super::LanguageError::UnexpectedToken(token1)); + } + Ok(super::CallFun { + type_: fun, + var: name, + params: op_params, + }) + } + + fn parse_op(&mut self) -> Result { + let op = self.parse_inner_op(0)?; + let token_last = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + if let crate::syntax::Token::Semicolon = token_last.token { + Ok(op) + } else { + Err(super::LanguageError::UnexpectedToken(token_last)) + } + } + + fn extend_err_tokens(mut tokens: Vec, err: super::LanguageError) -> super::LanguageError { + match err { + super::LanguageError::InvalidSyntax(mut seq) => { + tokens.append(&mut seq); + super::LanguageError::InvalidSyntax(tokens) + 
}, + super::LanguageError::UnexpectedEnd(mut seq) => { + tokens.append(&mut seq); + super::LanguageError::UnexpectedEnd(tokens) + } + e => e + } + } + + fn parse_inner_op(&mut self, recursion_level: usize) -> Result { + let token0 = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + let op0 = match token0.token { + crate::syntax::Token::Path(var_name) => { + // variable-oriented operations + let token1 = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + } + ])), + }; + match token1.token { + crate::syntax::Token::Colon => { + // Declare-assign or declare + let mut lookahead = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Colon, + info: token1.info, + }, + ])), + }; + let type_name = if let crate::syntax::Token::Path(type_name) = lookahead.token { + let type_token_info = lookahead.info; + lookahead = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Colon, + info: token1.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(type_name), + info: type_token_info, + }, + ])), + }; + Some((type_name, type_token_info)) + } else { + None + }; + 
if let crate::syntax::Token::Equal = lookahead.token { + // Declare-Assign + let inner_op = match self.parse_inner_op(recursion_level + 1) { + Ok(op) => op, + Err(e) => { + // roughly equivalent to self.parse_inner_op(...).map_err(|e| { ... }) + // (the closure captures variables which the compiler can't prove aren't used in this fn after) + let tokens = if let Some((type_name, type_token_info)) = type_name { + vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Colon, + info: token1.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(type_name), + info: type_token_info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Equal, + info: lookahead.info, + }, + ] + } else { + vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Colon, + info: token1.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Equal, + info: lookahead.info, + }, + ] + }; + return Err(Self::extend_err_tokens(tokens, e)); + } + }; + super::Op::DeclareAssign(super::tree::DeclareAssignVar { + var: var_name, + type_: type_name.map(|x| x.0), + op: Box::new(inner_op), + }) + } else { + // declare + self.lookahead = Some(lookahead); + super::Op::Declare(super::tree::DeclareVar { + var: var_name, + type_: type_name.map(|x| x.0), + }) + } + }, + crate::syntax::Token::Equal => { + // Assign + let token2 = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Equal, + info: token1.info, + } + ])), + }; + if let crate::syntax::Token::Field(f) = 
token2.token { + let inner_op = self.parse_inner_op(recursion_level + 1)?; + super::Op::Assign(crate::statement::tree::AssignVar { + var: var_name, + field: Some(f), + op: Box::new(inner_op), + }) + } else { + self.lookahead = Some(token2); + let inner_op = self.parse_inner_op(recursion_level + 1)?; + super::Op::Assign(crate::statement::tree::AssignVar { + var: var_name, + field: None, + op: Box::new(inner_op), + }) + } + }, + /*crate::syntax::Token::OpenRoundBracket => { + // Call + let op_params = match self.parse_op_params() { + Ok(ops) => ops, + Err(e) => { + return Err(Self::extend_err_tokens(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::OpenRoundBracket, + info: token1.info, + }, + ], e)); + } + }; + self.lookahead.take().unwrap(); // always a closing round bracket; no need to verify + let token_filter = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Equal, + info: token1.info, + }, + // TODO include tokens from op_params + ])), + }; + if let crate::syntax::Token::Functional(crate::syntax::Functional::Filter) = token_filter.token { + let filter_op = match self.parse_inner_op(recursion_level + 1) { + Ok(x) => x, + Err(e) => { + return Err(Self::extend_err_tokens(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Equal, + info: token1.info, + }, + // TODO include tokens from op_params + ], e)) + } + }; + super::Op::Call(super::CallVar { + var: var_name, + params: op_params, + }) + } else { + return Err(super::LanguageError::UnexpectedToken(token_filter)); + 
} + }*/ + t => { + // Retrieve + self.lookahead = Some(crate::syntax::SyntaxToken { + token: t, + info: token1.info, + }); + super::Op::Retrieve(var_name) + } + } + }, + crate::syntax::Token::Functional(fun) => { + // Call + match self.parse_function_call(fun.clone()) { + Ok(x) => super::Op::Call(x), + Err(e) => { + return Err(Self::extend_err_tokens(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Functional(fun), + info: token0.info, + } + ], e)); + } + } + } + crate::syntax::Token::Operation(unary_op) => { + // Unary operation + let inner_op = match self.parse_inner_op(recursion_level + 1) { + Ok(op) => op, + Err(e) => { + return Err(Self::extend_err_tokens(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Operation(unary_op), + info: token0.info, + } + ], e)); + } + }; + super::Op::Unary(super::tree::Unary { + first: Box::new(inner_op), + op: unary_op, + }) + }, + crate::syntax::Token::Literal(literal) => { + super::Op::Literal(literal) + }, + crate::syntax::Token::OpenRoundBracket => { + // Operation surrounded by brackets + let inner_op = match self.parse_inner_op(recursion_level + 1) { + Ok(op) => op, + Err(e) => { + return Err(Self::extend_err_tokens(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::OpenRoundBracket, + info: token0.info, + } + ], e)); + } + }; + let token_last = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + // TODO include all tokens from inner_op + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + if let crate::syntax::Token::CloseRoundBracket = token_last.token { + super::Op::Bracketed(Box::new(inner_op)) + } else { + // TODO maybe? 
include all tokens from inner_op + return Err(super::LanguageError::UnexpectedToken(token_last)); + } + } + t => { + return Err(super::LanguageError::InvalidSyntax(vec![ + crate::syntax::SyntaxToken { + token: t, + info: token0.info, + } + ])); + } + }; + + // check if operation continues (i.e. is dyadic) + let lookahead = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + if let crate::syntax::Token::Operation(dyadic_op) = lookahead.token { + let op1 = match self.parse_inner_op(recursion_level + 1) { + Ok(op) => op, + Err(e) => { + return Err(Self::extend_err_tokens(vec![ + // TODO add tokens of op0 too + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Operation(dyadic_op), + info: token0.info, + } + ], e)); + } + }; + Ok(super::Op::Dyadic(super::Dyadic { + first: Box::new(op0), + op: dyadic_op, + second: Box::new(op1), + })) + } else { + self.lookahead = Some(lookahead); + Ok(op0) + } + } + + fn parse_op_params(&mut self) -> Result, super::LanguageError> { + let mut lookahead = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + let mut ops = Vec::new(); + while !matches!(lookahead.token, crate::syntax::Token::CloseRoundBracket) { + self.lookahead = Some(lookahead); + ops.push(self.parse_op()?); + lookahead = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + // TODO add tokens of previous op(s) + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + } + self.lookahead = Some(lookahead); + Ok(ops) + } + + // [!] 
no unhandled lookaheads + fn parse_param(&mut self) -> Result { + let token0 = if let Some(lookahead) = self.lookahead.take() { + lookahead + } else { + match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(Vec::new())), + } + }; + if let crate::syntax::Token::Path(var_name) = token0.token { + let token1 = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + } + ])), + }; + match token1.token { + crate::syntax::Token::Colon => { + // with type declaration + let token2 = match self.iter.next() { + Some(Err(e)) => return Err(e.into()), + Some(Ok(t)) => t, + None => return Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Colon, + info: token1.info, + } + ])), + }; + if let crate::syntax::Token::Path(ty_name) = token2.token { + Ok(super::Param { + name: var_name, + type_: Some(ty_name), + }) + } else { + Err(super::LanguageError::InvalidSyntax(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Colon, + info: token1.info, + }, + token2, + ])) + } + }, + crate::syntax::Token::Semicolon => { + // without type declaration + Ok(super::Param { + name: var_name, + type_: None, + }) + }, + t => Err(super::LanguageError::InvalidSyntax(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(var_name), + info: token0.info, + }, + crate::syntax::SyntaxToken { + token: t, + info: token1.info, + } + ])) + } + } else { + Err(super::LanguageError::InvalidSyntax(vec![token0])) + } + } +} + +impl <'a> 
LanguageParser<'a, crate::syntax::TokenParser<'a, logos::Lexer<'a, crate::lexer::Token>>> { + pub fn lex(s: &'a str) -> Self { + Self::new(crate::syntax::TokenParser::new(crate::lexer::Token::tokenify(s))) + } +} + +impl <'a, I: core::iter::Iterator> + 'a> core::iter::Iterator for LanguageParser<'a, I> { + type Item = Result; + + fn next(&mut self) -> Option { + let opt_next_token = if let Some(token) = self.lookahead.take() { + Some(token) + } else { + match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => Some(t), + None => None, + } + }; + if let Some(token) = opt_next_token { + let statement = match token.token { + crate::syntax::Token::Path(p0) => { + let next_token = match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => t, + None => return Some(Err(super::LanguageError::UnexpectedEnd(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(p0), + info: token.info, + } + ]))), + }; + match next_token.token { + crate::syntax::Token::OpenCurlyBracket => { + // module + match self.parse_incomplete_module(p0.clone()) { + Ok(module) => self.incomplete_modules.push(module), + Err(e) => return Some(Err(e)), + } + // skip capturing by model by immediately returning + return Some(Ok(super::Statement::Notification(super::Notification::EnteringModule(p0)))); + }, + crate::syntax::Token::OpenRoundBracket => { + // function declaration + match self.parse_function_decl(p0) { + Ok(fn_decl) => super::Statement::Declare(super::Declare::Function(fn_decl)), + Err(e) => return Some(Err(e)), + } + }, + crate::syntax::Token::Equal => { + // type declaration + match self.parse_type_decl(p0) { + Ok(ty_decl) => super::Statement::Declare(super::Declare::Type(ty_decl)), + Err(e) => return Some(Err(e)), + } + }, + unrecognized => return Some(Err(super::LanguageError::InvalidSyntax(vec![ + crate::syntax::SyntaxToken { + token: crate::syntax::Token::Path(p0), + info: token.info, + }, + 
crate::syntax::SyntaxToken { + token: unrecognized, + info: next_token.info, + } + ]))), + } + }, + crate::syntax::Token::Functional(fun) => { + match self.parse_function_call(fun) { + Ok(f) => super::Statement::Entrypoint(f), + Err(e) => return Some(Err(e)) + } + } + crate::syntax::Token::CloseCurlyBracket => { + if let Some(module) = self.incomplete_modules.pop() { + super::Statement::Module(module) + } else { + return Some(Err(super::LanguageError::UnexpectedToken(crate::syntax::SyntaxToken { + token: crate::syntax::Token::CloseCurlyBracket, + info: token.info + }))); + } + } + t => return Some(Err(super::LanguageError::UnexpectedToken(crate::syntax::SyntaxToken { + token: t, + info: token.info + }))), + }; + if let Some(mut module) = self.incomplete_modules.pop() { + module.inner.push(statement.clone()); + self.incomplete_modules.push(module); + Some(Ok(super::Statement::Notification(super::Notification::CapturedByModule(Box::new(statement))))) + } else { + Some(Ok(statement)) + } + } else { + None + } + } +} diff --git a/crates/lang/src/statement/tree.rs b/crates/lang/src/statement/tree.rs new file mode 100644 index 0000000..e02d932 --- /dev/null +++ b/crates/lang/src/statement/tree.rs @@ -0,0 +1,176 @@ +/// Statement declaration +/// +/// Statement -> Declare +/// Statement -> Module +/// Statement -> CallFun [entrypoint] +/// +/// Statements -> Statement Statements +/// Statements -> DONE +#[derive(Debug, PartialEq, Clone)] +pub enum Statement { + Declare(Declare), + Module(Module), + Entrypoint(CallFun), + Notification(Notification), +} + +impl Statement { + pub fn is_ignore(&self) -> bool { + matches!(self, Self::Notification(_)) + } +} + +/// Fake tokens emitted by parser to avoid excessive recursion +#[derive(Debug, PartialEq, Clone)] +pub enum Notification { + EnteringModule(crate::syntax::Path), + CapturedByModule(Box), +} + +/// Param declaration +/// +/// Param -> Variable: Type +/// OR (depending on context) +/// Param -> Variable +/// +/// Params 
-> Param; Params [semicolon-separated] +/// Params -> ; +/// Params -> Param +#[derive(Debug, PartialEq, Clone)] +pub struct Param { + pub name: crate::syntax::Path, + pub type_: Option, +} + +/// Function or Type declaration +/// +/// Declare -> DeclareFun +/// Declare -> DeclareType +#[derive(Debug, PartialEq, Clone)] +pub enum Declare { + Function(DeclareFun), + Type(DeclareType), +} + +/// Function declaration +/// +/// DeclareFun -> Name (Params) Func { Ops } +/// Func -> => [map] +/// Func -> x> [filter] +/// Func -> ~> [sort] +/// Func -> <> [generator] +#[derive(Debug, PartialEq, Clone)] +pub struct DeclareFun { + pub name: crate::syntax::Path, + pub params: Vec, + pub type_: crate::syntax::Functional, + pub ops: Vec, +} + +/// Type declaration +/// +/// DeclareType -> Name = { Params } +#[derive(Debug, PartialEq, Clone)] +pub struct DeclareType { + pub name: crate::syntax::Path, + pub params: Vec, +} + +/// Module declaration +/// +/// Module -> Variable { Statements } +#[derive(Debug, PartialEq, Clone)] +pub struct Module { + pub name: crate::syntax::Path, + pub inner: Vec, +} + +/// Operation declaration +/// +/// Op -> Variable := Op [declare-assign] +/// Op -> Variable: Type [declare] +/// Op -> Variable = Op [assign] +/// Op -> Variable Fields [retrieve] +/// Op -> CallFun [invoke] +/// Op -> Op DualOp Op [dyadic] +/// Op -> UnaryOp Op [unary] +/// Op -> Literal [literal] +/// Op -> Bracketed [bracketed] +/// Fields -> +/// Fields -> .Name Fields +/// DualOp -> SetOp +/// DualOp -> n +/// DualOp -> u +/// DualOp -> + +/// DualOp -> - +/// DualOp -> * +/// DualOp -> / +/// DualOp -> && +/// DualOp -> || +/// SetOp -> n +/// SetOp -> u +/// UnaryOp -> - +/// Literal -> "Name" +/// Literal -> Integer +/// Literal -> Float +/// Bracketed -> (Op) +/// +/// +/// Ops -> Op; Ops [semicolon-separated] +/// Ops -> ; +/// Ops -> DONE +#[derive(Debug, PartialEq, Clone)] +pub enum Op { + DeclareAssign(DeclareAssignVar), + Declare(DeclareVar), + Assign(AssignVar), 
+ Retrieve(crate::syntax::Path), + Call(CallFun), + Dyadic(Dyadic), + Unary(Unary), + Literal(crate::syntax::Literal), + Bracketed(Box), +} + +#[derive(Debug, PartialEq, Clone)] +pub struct DeclareAssignVar { + pub var: crate::syntax::Path, + pub type_: Option, + pub op: Box, +} + +#[derive(Debug, PartialEq, Clone)] +pub struct DeclareVar { + pub var: crate::syntax::Path, + pub type_: Option, +} + +#[derive(Debug, PartialEq, Clone)] +pub struct AssignVar { + pub var: crate::syntax::Path, + pub field: Option, + pub op: Box, +} + +#[derive(Debug, PartialEq, Clone)] +pub struct Dyadic { + pub first: Box, + pub op: crate::syntax::Op, + pub second: Box, +} + +#[derive(Debug, PartialEq, Clone)] +pub struct Unary { + pub first: Box, + pub op: crate::syntax::Op, +} + +/// Function call declaration +/// +/// CallFun -> Func Name (Params) +#[derive(Debug, PartialEq, Clone)] +pub struct CallFun { + pub type_: crate::syntax::Functional, + pub var: crate::syntax::Path, + pub params: Vec, +} diff --git a/crates/lang/src/syntax/errors.rs b/crates/lang/src/syntax/errors.rs new file mode 100644 index 0000000..9e94ee7 --- /dev/null +++ b/crates/lang/src/syntax/errors.rs @@ -0,0 +1,25 @@ +#[derive(Debug, PartialEq, Clone)] +pub enum SyntaxError { + InvalidSequence(Vec), + UnrecognizedToken, +} + +impl core::fmt::Display for SyntaxError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + //use core::fmt::Write; + match self { + Self::InvalidSequence(seq) => write!(f, "Invalid sequence {:?}", seq.as_slice()), + Self::UnrecognizedToken => write!(f, "Unrecognized token"), + } + } +} + +impl std::error::Error for SyntaxError {} + +impl From for SyntaxError { + fn from(value: crate::lexer::LexError) -> Self { + match value { + crate::lexer::LexError::UnrecognizedToken => Self::UnrecognizedToken, + } + } +} diff --git a/crates/lang/src/syntax/mod.rs b/crates/lang/src/syntax/mod.rs new file mode 100644 index 0000000..4b5bd46 --- /dev/null +++ 
b/crates/lang/src/syntax/mod.rs @@ -0,0 +1,66 @@ +//! High-level syntax +mod errors; +pub use errors::SyntaxError; + +mod parser; +pub(crate) use parser::TokenParser; + +mod tokens; +pub use tokens::{SyntaxToken, Token, Literal, Op, Functional, Field, Path, Comment}; + +#[cfg(test)] +mod test { + use super::*; + + use pretty_assertions::assert_eq; + + const ALL_TOKENS_STR: &str = "u n + - * / && || => x> ~> <> = n_u_::is_a_::VaR1AbLe .th1s.is_A_.f13Ld ( ) { } ; -404 -1234.5 \"\" /* block comment */ // line comment \n"; + + #[test] + fn parse_everything() { + let expected = vec![ + Token::Operation(Op::Union), + Token::Operation(Op::Intersection), + Token::Operation(Op::Plus), + Token::Operation(Op::Minus), + Token::Operation(Op::Multiply), + Token::Operation(Op::Divide), + Token::Operation(Op::And), + Token::Operation(Op::Or), + Token::Functional(Functional::Map), + Token::Functional(Functional::Filter), + Token::Functional(Functional::Sort), + Token::Functional(Functional::Generate), + Token::Equal, + Token::Path(Path(vec!["n_u_".into(), "is_a_".into(), "VaR1AbLe".into()])), + Token::Field(Field(vec!["th1s".into(), "is_A_".into(), "f13Ld".into()])), + Token::OpenRoundBracket, + Token::CloseRoundBracket, + Token::OpenCurlyBracket, + Token::CloseCurlyBracket, + Token::Semicolon, + Token::Literal(Literal::Integer(-404)), + Token::Literal(Literal::Float(-1234.5)), + Token::Literal(Literal::String("".into())), + Token::Comment(Comment::Block(" block comment ".into())), + Token::Comment(Comment::Line(" line comment ".into())), + ]; + + let mut actual = Vec::new(); + for (index, token_result) in SyntaxToken::tokenify(ALL_TOKENS_STR).enumerate() { + assert!(token_result.is_ok(), "Token #{} (expected: {:?}) failed to parse: {:?}", index, expected[index], token_result.err()); + actual.push(token_result.unwrap().token); + } + + assert_eq!(actual, expected) + } + + #[test] + fn parse_reversability() { + let expected = format!("{} ", ALL_TOKENS_STR); + + let actual = 
SyntaxToken::stringify(SyntaxToken::tokenify(&expected).map(|token_result| token_result.unwrap())); + + assert_eq!(actual, expected) + } +} diff --git a/crates/lang/src/syntax/parser.rs b/crates/lang/src/syntax/parser.rs new file mode 100644 index 0000000..2496502 --- /dev/null +++ b/crates/lang/src/syntax/parser.rs @@ -0,0 +1,184 @@ +pub(crate) struct TokenParser<'a, I: core::iter::Iterator> + 'a> { + _idc: core::marker::PhantomData<&'a ()>, + iter: I, + lookahead: Option, +} + +impl <'a, I: core::iter::Iterator> + 'a> TokenParser<'a, I> { + pub fn new(tokens_in: I) -> Self { + Self { + _idc: Default::default(), + iter: tokens_in, + lookahead: None, + } + } + + fn rebuild_field_token_sequence(parts: Vec, infos: &mut Vec>) -> Vec { + parts.into_iter() + .flat_map(|var| [crate::lexer::Token::Dot(Self::take_first_some(infos).unwrap()), crate::lexer::Token::Variable((var, Self::take_first_some(infos).unwrap()))]) + .collect() + } + + fn rebuild_path_token_sequence(parts: Vec, infos: &mut Vec>) -> Vec { + let mut tokens: Vec<_> = parts.into_iter() + .flat_map(|var| [crate::lexer::Token::Variable((var, Self::take_first_some(infos).unwrap())), crate::lexer::Token::PathSeparator(Self::take_first_some(infos).unwrap())]) + .collect(); + if !tokens.is_empty() { + // remove trailing path separator + tokens.pop(); + } + tokens + } + + fn take_first_some(items: &mut Vec>) -> Option { + for i in items.iter_mut() { + if let Some(item) = i.take() { + return Some(item) + } + } + None + } +} + +impl <'a, I: core::iter::Iterator> + 'a> core::iter::Iterator for TokenParser<'a, I> { + type Item = Result; + + #[inline] + fn next(&mut self) -> Option { + let opt_next_lex_token = if let Some(lex_token) = self.lookahead.take() { + Some(lex_token) + } else { + match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => Some(t), + None => None, + } + }; + if let Some(lex_token) = opt_next_lex_token { + let translated = match lex_token { + 
crate::lexer::Token::Union(info) => super::Token::Operation(super::Op::Union).with(info), + crate::lexer::Token::Intersection(info) => super::Token::Operation(super::Op::Intersection).with(info), + crate::lexer::Token::Plus(info) => super::Token::Operation(super::Op::Plus).with(info), + crate::lexer::Token::Minus(info) => super::Token::Operation(super::Op::Minus).with(info), + crate::lexer::Token::Multiply(info) => super::Token::Operation(super::Op::Multiply).with(info), + crate::lexer::Token::Divide(info) => super::Token::Operation(super::Op::Divide).with(info), + crate::lexer::Token::And(info) => super::Token::Operation(super::Op::And).with(info), + crate::lexer::Token::Or(info) => super::Token::Operation(super::Op::Or).with(info), + crate::lexer::Token::Map(info) => super::Token::Functional(super::Functional::Map).with(info), + crate::lexer::Token::Filter(info) => super::Token::Functional(super::Functional::Filter).with(info), + crate::lexer::Token::Sort(info) => super::Token::Functional(super::Functional::Sort).with(info), + crate::lexer::Token::Generate(info) => super::Token::Functional(super::Functional::Generate).with(info), + crate::lexer::Token::Equal(info) => super::Token::Equal.with(info), + crate::lexer::Token::PathSeparator(info) => return Some(Err(super::SyntaxError::InvalidSequence(vec![crate::lexer::Token::PathSeparator(info)]))), + crate::lexer::Token::Dot(info) => { + // read all incoming dots and variable combos into a single path token + // e.g. 
[Dot, Variable("x"), Dot, Variable("y"), Dot, Variable("z")] becomes Field(["x", "y", "z"]) + let mut parts = Vec::new(); + let mut infos = Vec::new(); + infos.push(Some(info)); + loop { + let next_lex_token = match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => Some(t), + None => None, + }; + if let Some(next_lex_token) = next_lex_token { + match next_lex_token { + crate::lexer::Token::Variable((part, info)) => { + parts.push(part); + infos.push(Some(info)); + }, + invalid_token => { + let last_dot = crate::lexer::Token::Dot(infos.pop().unwrap().unwrap()); + let mut sequence = Self::rebuild_field_token_sequence(parts, &mut infos); + sequence.push(last_dot); + sequence.push(invalid_token); + return Some(Err(super::SyntaxError::InvalidSequence(sequence))); + } + } + } else { + let last_dot = crate::lexer::Token::Dot(infos.pop().unwrap().unwrap()); + let mut sequence = Self::rebuild_field_token_sequence(parts, &mut infos); + sequence.push(last_dot); + return Some(Err(super::SyntaxError::InvalidSequence(sequence))); + } + self.lookahead = match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => Some(t), + None => None, + }; + if let Some(crate::lexer::Token::Dot(info)) = &self.lookahead { + infos.push(Some(info.to_owned())); + self.lookahead = None; + } else { + break; + } + } + super::SyntaxToken { + token: super::Token::Field(super::Field(parts)), + info: infos.into_iter().map(|x| x.unwrap()).collect(), + } + } + crate::lexer::Token::Variable((root, info)) => { + // read all incoming path separators and variable combos into a single path token + // e.g. 
[Variable("x"), PathSeparator, Variable("y"), PathSeparator, Variable("z")] becomes Path(["x", "y", "z"]) + self.lookahead = match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => Some(t), + None => None, + }; + let mut parts = Vec::new(); + let mut infos = Vec::new(); + parts.push(root); + infos.push(Some(info)); + while let Some(crate::lexer::Token::PathSeparator(path_info)) = &self.lookahead { + infos.push(Some(path_info.to_owned())); + let next_lex_token = match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => Some(t), + None => None, + }; + if let Some(crate::lexer::Token::Variable((part, info))) = next_lex_token { + parts.push(part); + infos.push(Some(info)); + } else { + let last_sep = crate::lexer::Token::PathSeparator(infos.pop().unwrap().unwrap()); + let mut sequence = Self::rebuild_path_token_sequence(parts, &mut infos); + sequence.push(last_sep); + if let Some(lex_token) = next_lex_token { + sequence.push(lex_token); + return Some(Err(super::SyntaxError::InvalidSequence(sequence))) + } else { + return Some(Err(super::SyntaxError::InvalidSequence(sequence))); + } + } + self.lookahead = match self.iter.next() { + Some(Err(e)) => return Some(Err(e.into())), + Some(Ok(t)) => Some(t), + None => None, + }; + } + super::SyntaxToken { + token: super::Token::Path(super::Path(parts)), + info: infos.into_iter().map(|x| x.unwrap()).collect(), + } + }, + crate::lexer::Token::OpenRoundBracket(info) => super::Token::OpenRoundBracket.with(info), + crate::lexer::Token::CloseRoundBracket(info) => super::Token::CloseRoundBracket.with(info), + crate::lexer::Token::OpenCurlyBracket(info) => super::Token::OpenCurlyBracket.with(info), + crate::lexer::Token::CloseCurlyBracket(info) => super::Token::CloseCurlyBracket.with(info), + crate::lexer::Token::Colon(info) => super::Token::Colon.with(info), + crate::lexer::Token::Semicolon(info) => super::Token::Semicolon.with(info), + crate::lexer::Token::Integer((int, 
info)) => super::Token::Literal(super::Literal::Integer(int)).with(info), + crate::lexer::Token::Float((float, info)) => super::Token::Literal(super::Literal::Float(float)).with(info), + crate::lexer::Token::String((s, info)) => super::Token::Literal(super::Literal::String(s)).with(info), + crate::lexer::Token::LongComment((c, info)) => super::Token::Comment(super::tokens::Comment::Block(c)).with(info), + crate::lexer::Token::ShortComment((c, info)) => super::Token::Comment(super::tokens::Comment::Line(c)).with(info), + crate::lexer::Token::Newline(_) => panic!("Got non-ignored newline"), + }; + Some(Ok(translated)) + } else { + None + } + } +} diff --git a/crates/lang/src/syntax/tokens.rs b/crates/lang/src/syntax/tokens.rs new file mode 100644 index 0000000..95498c5 --- /dev/null +++ b/crates/lang/src/syntax/tokens.rs @@ -0,0 +1,188 @@ +#[derive(Debug, PartialEq, Clone)] +pub struct SyntaxToken { + pub token: Token, + pub info: Vec, +} + +#[derive(Debug, PartialEq, Clone)] +pub enum Token { + Operation(Op), + Functional(Functional), + + // Basics + Equal, + Field(Field), + Path(Path), + OpenRoundBracket, + CloseRoundBracket, + OpenCurlyBracket, + CloseCurlyBracket, + Colon, + Semicolon, + + Literal(Literal), + Comment(Comment), +} + +impl Token { + pub(super) fn with(self, info: crate::lexer::TokenInfo) -> SyntaxToken { + SyntaxToken { token: self, info: vec![info] } + } +} + +#[derive(Debug, PartialEq, Clone)] +pub struct Field(pub Vec); + +#[derive(Debug, PartialEq, Clone)] +pub struct Path(pub Vec); + +#[derive(Debug, PartialEq, Clone)] +pub enum Op { + // Set operations + Union, + Intersection, + // Arithmetic operations (also applicable to sets) + Plus, + Minus, + Multiply, + Divide, + // Logical operations + And, + Or, +} + +impl Op { + pub fn as_str(&self) -> &'static str { + match self { + Self::Union => "u", + Self::Intersection => "n", + Self::Plus => "+", + Self::Minus => "-", + Self::Multiply => "*", + Self::Divide => "/", + Self::And => "&&", + 
/// A literal value appearing in source text.
#[derive(Debug, PartialEq, Clone)]
pub enum Literal {
    Integer(i64),
    Float(f64),
    String(String),
}

impl Literal {
    /// Renders the literal as source text.
    ///
    /// * integers are written in decimal;
    /// * finite floats always carry a decimal point (`1.0`, not `1`), so the
    ///   text stays distinguishable from an integer literal;
    /// * strings are wrapped in double quotes; their content is written
    ///   as-is, without escaping.
    pub fn as_str(&self) -> String {
        match self {
            Self::Integer(int) => int.to_string(),
            Self::Float(float) => {
                let mut text = float.to_string();
                // `Display` prints whole-valued floats without a fractional
                // part ("1" for 1.0); put it back. NaN / infinities are left
                // untouched.
                if float.is_finite() && !text.contains('.') {
                    text.push_str(".0");
                }
                text
            }
            Self::String(s) => format!("\"{}\"", s),
        }
    }
}
super::TokenParser::new(crate::lexer::Token::tokenify(s)) + } + + pub fn stringify<'a>(tokens: impl core::iter::Iterator + 'a) -> String { + use core::fmt::Write; + let mut result = String::new(); + tokens.for_each(|t| { + t.write_str(&mut result).unwrap(); + write!(result, " ").unwrap(); + }); + result + } + + pub fn stringify_ref<'a, 'b>(tokens: impl core::iter::Iterator + 'a) -> String { + use core::fmt::Write; + let mut result = String::new(); + tokens.for_each(|t| { + t.write_str(&mut result).unwrap(); + write!(result, " ").unwrap(); + }); + result + } + + pub fn as_str(&self) -> String { + let mut s = String::new(); + self.write_str(&mut s).unwrap(); + s + } + + pub fn write_str(&self, result: &mut String) -> std::fmt::Result { + self.token.write_str(result) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..e7a11a9 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +}