From 6db55dba1186ac3d06b3286e4db24484b20af40c Mon Sep 17 00:00:00 2001 From: Volodymyr Herashchenko Date: Thu, 18 Dec 2025 11:12:03 +0200 Subject: [PATCH] add lexer The lexer parses incoming code into tokens, which makes it simpler to process using `chumsky`. --- Cargo.lock | 113 +++++++++++++++++++++++++- Cargo.toml | 1 + src/lexer.rs | 223 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 4 files changed, 334 insertions(+), 4 deletions(-) create mode 100644 src/lexer.rs diff --git a/Cargo.lock b/Cargo.lock index 78ba77bb..dafc305f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "0.6.18" @@ -61,6 +67,15 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "ar_archive_writer" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" +dependencies = [ + "object", +] + [[package]] name = "arbitrary" version = "1.1.3" @@ -181,12 +196,14 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "cc" -version = "1.0.83" +version = "1.2.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "90583009037521a116abf44494efecd645ba48b6622457080f080b85544e2215" dependencies = [ + "find-msvc-tools", "jobserver", "libc", + "shlex", ] [[package]] @@ -195,6 +212,20 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chumsky" +version = "0.11.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "acc17a6284abccac6e50db35c1cee87f605474a72939b959a3a67d9371800efd" +dependencies = [ + "hashbrown", + "regex-automata", + "serde", + "stacker", + "unicode-ident", + "unicode-segmentation", +] + [[package]] name = "clap" version = "4.5.37" @@ -292,6 +323,24 @@ dependencies = [ "secp256k1-zkp", ] +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "generic-array" version = "0.14.7" @@ -321,6 +370,17 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8449d342b1c67f49169e92e71deb7b9b27f30062301a16dbc27a4cc8d2351b7" +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + [[package]] name = "hex-conservative" version = "0.2.1" @@ -377,9 +437,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.147" +version = "0.2.178" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" [[package]] name = "libfuzzer-sys" @@ -413,6 +473,15 @@ dependencies = [ "bitcoin", ] +[[package]] +name = "object" +version = "0.32.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -479,6 +548,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psm" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" +dependencies = [ + "ar_archive_writer", + "cc", +] + [[package]] name = "quote" version = "1.0.33" @@ -646,6 +725,12 @@ dependencies = [ "digest", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "simplicity-lang" version = "0.6.0" @@ -680,6 +765,7 @@ version = "0.3.0" dependencies = [ "arbitrary", "base64 0.21.3", + "chumsky", "clap", "either", "getrandom", @@ -704,6 +790,19 @@ dependencies = [ "simplicityhl", ] +[[package]] +name = "stacker" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys", +] + [[package]] name = "strsim" version = "0.11.1" @@ -770,6 +869,12 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "utf8parse" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index f14ab530..4e8378b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ either = "1.12.0" itertools = "0.13.0" arbitrary 
= { version = "1", optional = true, features = ["derive"] }
 clap = "4.5.37"
+chumsky = "0.11.2"
 
 [target.wasm32-unknown-unknown.dependencies]
 getrandom = { version = "0.2", features = ["js"] }
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 00000000..d8bb14ef
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,269 @@
+//! Lexer that splits source text into a flat list of spanned [`Token`]s.
+//!
+//! The tokens are meant to be processed further using `chumsky`.
+
+use chumsky::prelude::*;
+use std::fmt;
+
+/// Region of the source text that a token was read from.
+pub type Span = SimpleSpan;
+/// A value paired with the [`Span`] it was read from.
+pub type Spanned<T> = (T, Span);
+
+/// A single lexical token.
+///
+/// Variants holding a `&'src str` borrow that slice from the lexed source text.
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
+pub enum Token<'src> {
+    // Keywords
+    Fn,
+    Let,
+    Type,
+    Mod,
+    Const,
+    Match,
+
+    // Control symbols
+    Arrow,
+    Colon,
+    Semi,
+    Comma,
+    Eq,
+    FatArrow,
+    LParen,
+    RParen,
+    LBracket,
+    RBracket,
+    LBrace,
+    RBrace,
+    AngleOpen,
+    AngleClose,
+
+    // Number literals
+    //
+    // `HexLiteral` and `BinLiteral` hold only the digits, without the `0x` / `0b` prefix.
+    DecLiteral(&'src str),
+    HexLiteral(&'src str),
+    BinLiteral(&'src str),
+
+    // Boolean literal
+    Bool(bool),
+
+    // Identifier
+    Ident(&'src str),
+
+    // Jets, witnesses, and params
+    //
+    // Each holds the identifier that follows the `jet::` / `witness::` / `param::` prefix.
+    Jet(&'src str),
+    Witness(&'src str),
+    Param(&'src str),
+
+    // Built-in types (List, Option, Either)
+    BuiltinType(&'src str),
+
+    // Unsigned integer types
+    UnsignedType(&'src str),
+
+    // Boolean type
+    BooleanType,
+
+    // Built-in functions, including the macros `assert!`, `panic!` and `dbg!`
+    BuiltinFn(&'src str),
+
+    // Built-in aliases
+    BuiltinAlias(&'src str),
+
+    // Comments and block comments
+    //
+    // We would discard them for the compiler, but they are needed, for example, for the formatter.
+    // The comment text is not stored in the token.
+    Comment,
+    BlockComment,
+}
+
+/// Writes fixed tokens as their source spelling and puts the prefix back on prefixed tokens
+/// (`0x`, `0b`, `jet::`, `witness::`, `param::`). Comments are written as placeholder names.
+impl<'src> fmt::Display for Token<'src> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Token::Fn => write!(f, "fn"),
+            Token::Let => write!(f, "let"),
+            Token::Type => write!(f, "type"),
+            Token::Mod => write!(f, "mod"),
+            Token::Const => write!(f, "const"),
+            Token::Match => write!(f, "match"),
+
+            Token::Arrow => write!(f, "->"),
+            Token::Colon => write!(f, ":"),
+            Token::Semi => write!(f, ";"),
+            Token::Comma => write!(f, ","),
+            Token::Eq => write!(f, "="),
+            Token::FatArrow => write!(f, "=>"),
+            Token::LParen => write!(f, "("),
+            Token::RParen => write!(f, ")"),
+            Token::LBracket => write!(f, "["),
+            Token::RBracket => write!(f, "]"),
+            Token::LBrace => write!(f, "{{"),
+            Token::RBrace => write!(f, "}}"),
+            Token::AngleOpen => write!(f, "<"),
+            Token::AngleClose => write!(f, ">"),
+
+            Token::DecLiteral(s) => write!(f, "{}", s),
+            // The lexer drops the radix prefix; restore it here.
+            Token::HexLiteral(s) => write!(f, "0x{}", s),
+            Token::BinLiteral(s) => write!(f, "0b{}", s),
+
+            Token::Ident(s) => write!(f, "{}", s),
+
+            Token::Jet(s) => write!(f, "jet::{}", s),
+            Token::Witness(s) => write!(f, "witness::{}", s),
+            Token::Param(s) => write!(f, "param::{}", s),
+
+            Token::BuiltinType(s) => write!(f, "{}", s),
+            Token::UnsignedType(s) => write!(f, "{}", s),
+            Token::BuiltinFn(s) => write!(f, "{}", s),
+            Token::BuiltinAlias(s) => write!(f, "{}", s),
+
+            Token::BooleanType => write!(f, "bool"),
+            Token::Bool(b) => write!(f, "{}", b),
+
+            Token::Comment => write!(f, "comment"),
+            Token::BlockComment => write!(f, "block_comment"),
+        }
+    }
+}
+
+/// Builds the lexer, which turns source text into a list of tokens with their spans.
+///
+/// Whitespace around tokens is skipped. Where no token matches, input is skipped character by
+/// character and lexing is retried until the end of input, so it can continue after an error.
+fn lexer<'src>(
+) -> impl Parser<'src, &'src str, Vec<Spanned<Token<'src>>>, extra::Err<Rich<'src, char>>> {
+    // `text::int` stops after a leading `0`, which would split literals such as `0x00ff`.
+    // Accept any non-empty run of digits instead.
+    let num = text::digits(10).to_slice().map(Token::DecLiteral);
+    let hex = just("0x")
+        .ignore_then(text::digits(16).to_slice())
+        .map(Token::HexLiteral);
+    let bin = just("0b")
+        .ignore_then(text::digits(2).to_slice())
+        .map(Token::BinLiteral);
+
+    let macros = choice((just("assert!"), just("panic!"), just("dbg!"))).map(Token::BuiltinFn);
+
+    let keyword = text::ident().map(|s| match s {
+        "fn" => Token::Fn,
+        "let" => Token::Let,
+        "type" => Token::Type,
+        "mod" => Token::Mod,
+        "const" => Token::Const,
+        "match" => Token::Match,
+        "true" => Token::Bool(true),
+        "false" => Token::Bool(false),
+        "List" | "Either" | "Option" => Token::BuiltinType(s),
+        "u1" | "u2" | "u4" | "u8" | "u16" | "u32" | "u64" | "u128" | "u256" => {
+            Token::UnsignedType(s)
+        }
+        "bool" => Token::BooleanType,
+        "unwrap_left" | "unwrap_right" | "array_fold" | "for_while" | "is_none" | "unwrap"
+        | "into" | "fold" => Token::BuiltinFn(s),
+        "Ctx8" | "Pubkey" | "Message64" | "Message" | "Signature" | "Scalar" | "Fe" | "Gej"
+        | "Ge" | "Point" | "Height" | "Time" | "Distance" | "Duration" | "Lock" | "Outpoint"
+        | "Confidential1" | "ExplicitAsset" | "Asset1" | "ExplicitAmount" | "Amount1"
+        | "ExplicitNonce" | "Nonce" | "TokenAmount1" => Token::BuiltinAlias(s),
+        _ => Token::Ident(s),
+    });
+
+    let jet = just("jet::")
+        .ignore_then(text::ident())
+        .map(Token::Jet)
+        .labelled("jet");
+    let witness = just("witness::")
+        .ignore_then(text::ident())
+        .map(Token::Witness)
+        .labelled("witness");
+    let param = just("param::")
+        .ignore_then(text::ident())
+        .map(Token::Param)
+        .labelled("param");
+
+    let op = choice((
+        just("->").to(Token::Arrow),
+        just("=>").to(Token::FatArrow),
+        just("=").to(Token::Eq),
+        just(":").to(Token::Colon),
+        just(";").to(Token::Semi),
+        just(",").to(Token::Comma),
+        just("(").to(Token::LParen),
+        just(")").to(Token::RParen),
+        just("[").to(Token::LBracket),
+        just("]").to(Token::RBracket),
+        just("{").to(Token::LBrace),
+        just("}").to(Token::RBrace),
+        just("<").to(Token::AngleOpen),
+        just(">").to(Token::AngleClose),
+    ));
+
+    let comment = just("//")
+        .ignore_then(any().and_is(just('\n').not()).repeated())
+        .to(Token::Comment);
+
+    let block_comment = just("/*")
+        .ignore_then(just("*/").not().then(any()).repeated())
+        .then_ignore(just("*/"))
+        .to(Token::BlockComment);
+
+    let token = choice((
+        comment,
+        block_comment,
+        jet,
+        witness,
+        param,
+        macros,
+        keyword,
+        hex,
+        bin,
+        num,
+        op,
+    ));
+
+    token
+        .map_with(|tok, e| (tok, e.span()))
+        .padded()
+        .recover_with(skip_then_retry_until(any().ignored(), end()))
+        .repeated()
+        .collect()
+}
+
+#[test]
+fn lexer_test() {
+    // Check if the lexer parses the example file without errors.
+    let src = include_str!("../examples/last_will.simf");
+
+    let (tokens, lex_errs) = lexer().parse(src).into_output_errors();
+
+    assert!(lex_errs.is_empty(), "unexpected lexer errors: {:?}", lex_errs);
+    assert!(tokens.is_some());
+}
+
+#[test]
+fn literals_keep_leading_zeros() {
+    // Each literal must come out as one token, leading zeros included.
+    let (tokens, lex_errs) = lexer().parse("0x00ff 0b0101 007").into_output_errors();
+
+    assert!(lex_errs.is_empty(), "unexpected lexer errors: {:?}", lex_errs);
+    let tokens: Vec<_> = tokens
+        .unwrap_or_default()
+        .into_iter()
+        .map(|(tok, _)| tok)
+        .collect();
+    assert_eq!(
+        tokens,
+        vec![
+            Token::HexLiteral("00ff"),
+            Token::BinLiteral("0101"),
+            Token::DecLiteral("007"),
+        ]
+    );
+}
diff --git a/src/lib.rs b/src/lib.rs
index ad5aadfc..e1077ca2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,6 +7,7 @@ pub mod debug;
 pub mod dummy_env;
 pub mod error;
 pub mod jet;
+pub mod lexer;
 pub mod named;
 pub mod num;
 pub mod parse;