Language Builder

This end-to-end guide walks you through building a complete new programming language with CDTk, from token declarations through to an SDXF bundle that the UAB engine can compile.

Overview

We'll build ArrowLang — a small functional language that compiles to Python. Features: function declarations (fn) with optional arrow (->) return types, let bindings, return statements, if/else, and simple binary expressions.

Step 1 — Create the Project

# Create a class-library project named ArrowLang targeting net10.0
dotnet new classlib -n ArrowLang --framework net10.0
cd ArrowLang
# Reference the CDTk project; the path is relative to the ArrowLang directory
dotnet add reference ../CSharp.CDTk/CSharp.CDTk.csproj

Step 2 — Declare Tokens & Roles

Create ArrowLangGrammar.cs. Declare all tokens as public static fields, most-specific first, and add a Map Structural for role assignments:

using CDTk;

/// <summary>
/// Grammar definition for ArrowLang: declares the language's tokens and the
/// structural role map as public static fields.
/// </summary>
/// <remarks>
/// Declaration order is significant: tokens are listed most-specific first
/// (keywords, then multi-char operators, then single-char operators and
/// delimiters), with the catch-all tokens last.
/// </remarks>
public class ArrowLangGrammar : Grammar {
    // Keywords (most-specific, declared first)
    public static Token KW_FN     = Kw("fn");
    public static Token KW_LET    = Kw("let");
    public static Token KW_RETURN = Kw("return");
    public static Token KW_IF     = Kw("if");
    public static Token KW_ELSE   = Kw("else");
    public static Token KW_INT    = Kw("int");
    public static Token KW_VOID   = Kw("void");

    // Multi-char operators first: "->", "==" and "<=" are declared ahead of
    // the single-char "-", "=" and "<" that they begin with.
    // NOTE(review): presumably so the lexer tries the longer operator first —
    // confirm against CDTk's lexer semantics.
    public static Token ARROW   = Op("->");
    public static Token OP_EQ   = Op("==");
    public static Token OP_NEQ  = Op("!=");
    public static Token OP_LEQ  = Op("<=");
    public static Token PLUS    = Op("+");
    public static Token MINUS   = Op("-");
    public static Token STAR    = Op("*");
    public static Token ASSIGN  = Op("=");
    public static Token LT      = Op("<");

    // Delimiters
    public static Token LBRACE  = Punct("{");
    public static Token RBRACE  = Punct("}");
    public static Token LPAREN  = Punct("(");
    public static Token RPAREN  = Punct(")");
    public static Token COMMA   = Punct(",");
    public static Token SEMI    = Punct(";");
    public static Token COLON   = Punct(":");

    // Catch-all (lowest priority)
    // Declared after the keywords so that e.g. "fn" is not taken as an identifier.
    public static Token IDENT   = Id();
    public static Token INT     = Num();
    public static Token STR     = Str();

    // Structural roles — auto-applied by CDTk
    // Maps keyword tokens to role names; both type keywords share "TypeKeyword".
    // NOTE(review): KW_LET has no entry here — confirm that is intentional.
    public static Map Structural = new() {
        { KW_FN,     "FuncKeyword"   },
        { KW_RETURN, "ReturnKeyword" },
        { KW_IF,     "IfKeyword"     },
        { KW_ELSE,   "ElseKeyword"   },
        { KW_INT,    "TypeKeyword"   },
        { KW_VOID,   "TypeKeyword"   },
    };
}

Step 3 — Define Grammar Rules

Add rules as public static Rule fields inside the ArrowLangGrammar class. Use Ref() for recursive or forward references (a rule that refers to itself or to a rule declared later), and set RootRule:

    // Expression (right-recursive)
    // Each binary alternative takes an IDENT as its left operand, an operator,
    // and then recurses into ExprRule on the right. The self-reference goes
    // through Ref(() => ExprRule) because the field is still being initialized.
    // The bare INT / IDENT alternatives come last and terminate the recursion.
    // NOTE(review): as written, an INT literal or a parenthesized expression
    // cannot be the left operand of an operator (e.g. "1 + a"), and there is no
    // operator precedence — confirm this is the intended tutorial scope.
    // NOTE(review): the OP_NEQ and OP_LEQ tokens have no alternative here.
    public static Rule ExprRule = Alt(
        Seq(IDENT, PLUS,  Ref(() => ExprRule)),
        Seq(IDENT, MINUS, Ref(() => ExprRule)),
        Seq(IDENT, STAR,  Ref(() => ExprRule)),
        Seq(IDENT, OP_EQ, Ref(() => ExprRule)),
        Seq(IDENT, LT,    Ref(() => ExprRule)),
        Seq(LPAREN, Ref(() => ExprRule), RPAREN),
        INT,
        IDENT
    );

    // Let binding: let x = expr;
    // ExprRule is referenced directly because it is declared above this field.
    public static Rule LetStmt = Seq(KW_LET, IDENT, ASSIGN, ExprRule, SEMI);

    // Return statement: return expr;
    public static Rule RetStmt = Seq(KW_RETURN, ExprRule, SEMI);

    // If/else: if (expr) { ... } with an optional else { ... }
    // Block is declared further down, so it is reached through Ref(() => Block)
    // (forward reference) rather than by reading the field directly.
    public static Rule IfStmt = Seq(
        KW_IF, LPAREN, ExprRule, RPAREN, Ref(() => Block),
        Opt(Seq(KW_ELSE, Ref(() => Block)))
    );

    // Statement: one of let, return, or if
    public static Rule Statement = Alt(LetStmt, RetStmt, IfStmt);

    // Block: { stmt* } — zero or more statements between braces
    public static Rule Block = Seq(LBRACE, Rep(Statement), RBRACE);

    // Parameter: type name — the type is either the int keyword or an identifier
    public static Rule Param = Seq(Alt(KW_INT, IDENT), IDENT);
    // Parameter list: optional, comma-separated (empty list is allowed)
    public static Rule Params = Opt(Seq(Param, Rep(Seq(COMMA, Param))));

    // Function: fn name(params) -> rettype { body }
    // The "-> rettype" part is optional; rettype is int, void, or an identifier.
    public static Rule FnDecl = Seq(
        KW_FN, IDENT, LPAREN, Params, RPAREN,
        Opt(Seq(ARROW, Alt(KW_INT, KW_VOID, IDENT))),
        Block
    );

    // Program is one or more function declarations
    public static Rule Program = Rep1(FnDecl);

    // Exposes Program as this grammar's root rule.
    public override Rule RootRule => Program;

Step 4 — Implement Render()

Override Render() to emit Python from the translated SemanticTable (StringBuilder requires using System.Text; at the top of the file):

    /// <summary>
    /// Renders every morphism in <paramref name="table"/> as a Python
    /// function definition.
    /// </summary>
    /// <param name="table">The translated semantic table whose <c>Morphisms</c> are emitted.</param>
    /// <returns>
    /// Python source text: one <c>def</c> per morphism with its body lines
    /// indented by four spaces, each definition followed by a blank line.
    /// </returns>
    public override string Render(SemanticTable table) {
        var sb = new StringBuilder();
        foreach (var fn in table.Morphisms) {
            // Python def; fn.Domain is interpolated as the parameter list
            sb.AppendLine($"def {fn.Name}({fn.Domain}):");

            // TrimEntries strips each line and, combined with RemoveEmptyEntries,
            // also drops lines that contained only whitespace.
            // NOTE(review): trimming discards any leading indentation in fn.Body,
            // so nested blocks (e.g. if/else) would be flattened — confirm what
            // shape fn.Body has before relying on this for nested statements.
            var bodyLines = fn.Body.Split('\n',
                StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

            foreach (var line in bodyLines) {
                sb.AppendLine($"    {line}");
            }

            // A Python def needs at least one statement; an empty ArrowLang
            // block ("{ }") would otherwise produce a syntax error.
            if (bodyLines.Length == 0) {
                sb.AppendLine("    pass");
            }

            sb.AppendLine();
        }
        return sb.ToString();
    }

Step 5 — Wire Up the Compiler

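Create a Program.cs that reads ArrowLang source and translates it to Python. Top-level statements require an executable project, so put this file in a console app (dotnet new console) that references the ArrowLang class library from Step 1: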

using CDTk;

// Load the ArrowLang program text from disk.
var arrowSource = File.ReadAllText("hello.arrow");

// Compile with ArrowLang as the input grammar and Python as the output grammar,
// then print the result (no trailing newline is added).
var pythonOutput = Compiler.CompileText(new ArrowLangGrammar(), new PythonGrammar(), arrowSource);
Console.Write(pythonOutput);

Step 6 — Bundle as SDXF

To distribute your language for use with the UAB engine without shipping C# DLLs, serialize the grammar pair as an SDXF binary:

// Serialize input + output grammar to SDXF binary
// NOTE(review): `sourceFiles` is not defined in this snippet — presumably the
// ArrowLang source files to include in the bundle; confirm the expected type.
// NOTE(review): the API names here spell "Sdfx" while the format and file
// extension are "SDXF" / ".sdxf" — verify the spelling against the CDTk API.
var encoder = new SdfxEncoder(new ArrowLangGrammar(), new PythonGrammar());
byte[] bundle = encoder.Encode(sourceFiles);
File.WriteAllBytes("arrowlang.sdxf", bundle);

// Or just get the grammar bytes for embedding:
byte[] grammarBytes = Compiler.EncodeGrammarSdfx(new ArrowLangGrammar());

Step 7 — Test the Pipeline

Write a smoke test. CDTk's built-in testing infrastructure makes it easy to verify round-trips:

// Input ArrowLang source
var input = """
fn add(int a, int b) -> int {
    return a + b;
}
""";

// Translate to Python
var py = Compiler.CompileText(new ArrowLangGrammar(), new PythonGrammar(), input);

// Verify output contains expected Python constructs.
// Explicit throws are used rather than Debug.Assert: Debug.Assert calls are
// compiled out of Release builds, which would let a broken pipeline pass
// silently, and they also require using System.Diagnostics.
if (!py.Contains("def add")) {
    throw new InvalidOperationException("Expected output to contain \"def add\".");
}
if (!py.Contains("return a + b")) {
    throw new InvalidOperationException("Expected output to contain \"return a + b\".");
}
Console.WriteLine("✓ ArrowLang → Python");
Console.WriteLine(py);

Complete Grammar Class

You're done!
Your grammar class needs: (1) token fields, (2) a Structural map, (3) rule fields with a RootRule, and (4) a Render() override. CDTk handles everything else — lexing, parsing, semantic table building, and translation.

Full example grammars for C#, Python, WASM, and LLVM IR are available on GitHub in the Testing/Grammars/ directory. Each grammar is ≈200 lines.