typst/src/syntax/tests.rs

#![allow(non_snake_case)]

use std::num::NonZeroUsize;
use std::sync::Arc;

use super::*;
use crate::geom::{AbsUnit, AngleUnit};

use ErrorPos::*;
use Option::None;
use SyntaxKind::*;
use TokenMode::{Code, Markup};

use std::fmt::Debug;

/// Compares a tokenization result against its expectation.
///
/// Does nothing when `found` and `expected` compare equal. Otherwise the
/// source `text`, the expected value and the found value are printed to
/// stdout and the function panics with the message `test failed`.
#[track_caller]
pub fn check<T>(text: &str, found: T, expected: T)
where
    T: Debug + PartialEq,
{
    let mismatch = found != expected;
    if !mismatch {
        return;
    }

    println!("source:   {text:?}");
    println!("expected: {expected:#?}");
    println!("found:    {found:#?}");
    panic!("test failed");
}

/// Shorthand for a `SyntaxKind::Space` token with the given newline count.
fn Space(newlines: usize) -> SyntaxKind {
    SyntaxKind::Space { newlines }
}

/// Shorthand for a `SyntaxKind::Raw` token built from its text, an optional
/// language tag and the block flag.
fn Raw(text: &str, lang: Option<&str>, block: bool) -> SyntaxKind {
    let fields = RawFields {
        text: text.into(),
        lang: lang.map(|tag| tag.into()),
        block,
    };
    SyntaxKind::Raw(Arc::new(fields))
}

/// Shorthand for a `SyntaxKind::Str` token holding `string`.
fn Str(string: &str) -> SyntaxKind {
    let value = string.into();
    SyntaxKind::Str(value)
}

/// Shorthand for a `SyntaxKind::Text` token holding `string`.
fn Text(string: &str) -> SyntaxKind {
    let value = string.into();
    SyntaxKind::Text(value)
}

/// Shorthand for a `SyntaxKind::Ident` token named `ident`.
fn Ident(ident: &str) -> SyntaxKind {
    let name = ident.into();
    SyntaxKind::Ident(name)
}

fn Error(pos: ErrorPos, message: &str) -> SyntaxKind {
    SyntaxKind::Error(pos, message.into())
}

/// Building blocks for suffix testing.
///
/// We extend each test case with a collection of different suffixes to make
/// sure tokens end at the correct position. These suffixes are split into
/// blocks, which can be disabled/enabled per test case. For example, when
/// testing identifiers we disable letter suffixes because these would
/// mingle with the identifiers.
///
/// A test case selects its blocks with an optional string literal in square
/// brackets after the mode (e.g. `t!(Code[" /"]: ...)`). When the literal is
/// omitted, all blocks listed here are enabled; an empty literal (`[""]`)
/// disables every suffix. The `t!` macro asserts that the literal only
/// contains characters from this constant.
///
/// Suffix blocks:
/// - ' ': spacing
/// - 'a': letters
/// - '1': numbers
/// - '/': symbols
const BLOCKS: &str = " a1/";

/// Yields every suffix that the `t!` macro may append to a test input.
///
/// Each entry is a four-tuple of:
///
/// - the block (one of the characters in `BLOCKS`) the suffix is part of
/// - the mode in which the suffix is applicable (`None`: no mode restriction)
/// - the suffix string
/// - the `SyntaxKind` the suffix is expected to produce
fn suffixes() -> impl Iterator<Item = (char, Option<TokenMode>, &'static str, SyntaxKind)>
{
    let table = [
        // Whitespace suffixes.
        (' ', None, " ", Space(0)),
        (' ', None, "\n", Space(1)),
        (' ', None, "\r", Space(1)),
        (' ', None, "\r\n", Space(1)),
        // Letter suffixes.
        ('a', Some(Markup), "hello", Text("hello")),
        ('a', Some(Markup), "💚", Text("💚")),
        ('a', Some(Code), "val", Ident("val")),
        ('a', Some(Code), "α", Ident("α")),
        ('a', Some(Code), "_", Ident("_")),
        // Number suffixes.
        ('1', Some(Code), "2", Int(2)),
        ('1', Some(Code), ".2", Float(0.2)),
        // Symbol suffixes.
        ('/', None, "[", LeftBracket),
        ('/', None, "//", LineComment),
        ('/', None, "/**/", BlockComment),
        ('/', Some(Markup), "*", Star),
        ('/', Some(Markup), r"\\", Escape('\\')),
        ('/', Some(Markup), "#let", Let),
        ('/', Some(Code), "(", LeftParen),
        ('/', Some(Code), ":", Colon),
        ('/', Some(Code), "+=", PlusEq),
    ];

    table.into_iter()
}

// Tokenizer test driver.
//
// Forms:
// - `t!(Both ...)`: runs the same case once in `Markup` and once in `Code`.
// - `t!(Mode[blocks]: text => tokens...)`: checks `text` on its own and then
//   once per applicable suffix from `suffixes()`. The `[blocks]` literal is
//   optional and defaults to `BLOCKS`.
// - `t!(@Mode: text => tokens...)`: internal arm that tokenizes `text` and
//   compares the result with the expected tokens via `check`.
macro_rules! t {
    (Both $($tts:tt)*) => {
        t!(Markup $($tts)*);
        t!(Code $($tts)*);
    };
    ($mode:ident $([$blocks:literal])?: $text:expr => $($token:expr),*) => {{
        // Test without suffix.
        t!(@$mode: $text => $($token),*);

        // Test with each applicable suffix.
        for (block, mode, suffix, ref token) in suffixes() {
            let text = $text;
            // Default to all blocks; shadowed below when the test case passes
            // its own selection (which leaves this binding unused).
            #[allow(unused_variables)]
            let blocks = BLOCKS;
            $(let blocks = $blocks;)?
            // Reject selections containing characters that are not a known block.
            assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
            // A suffix applies if it is not restricted to another mode and its
            // block is enabled for this test case.
            if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
                // Expect the original tokens followed by the suffix's token.
                t!(@$mode: format!("{}{}", text, suffix) => $($token,)* token);
            }
        }
    }};
    (@$mode:ident: $text:expr => $($token:expr),*) => {{
        let text = $text;
        let found = Tokens::new(&text, $mode).collect::<Vec<_>>();
        // Tokens are cloned so the same expressions can be reused per suffix.
        let expected = vec![$($token.clone()),*];
        check(&text, found, expected);
    }};
}

/// Checks braces, brackets and parentheses in markup and code mode.
#[test]
fn test_tokenize_brackets() {
    // Braces and square brackets produce the same token in both modes.
    let shared = [
        ("{", LeftBrace),
        ("}", RightBrace),
        ("[", LeftBracket),
        ("]", RightBracket),
    ];
    for (src, kind) in shared {
        t!(Both: src => kind);
    }

    // Parentheses are plain text in markup ...
    t!(Markup[" /"]: "(" => Text("("));
    t!(Markup[" /"]: ")" => Text(")"));

    // ... but tokens of their own in code.
    t!(Code: "(" => LeftParen);
    t!(Code: ")" => RightParen);
}

/// Checks whitespace tokens and their newline counts in both modes.
#[test]
fn test_tokenize_whitespace() {
    // Empty input produces no tokens at all.
    t!(Both["a1/"]: "" => );

    // Each input paired with the newline count of the expected `Space`.
    let cases = [
        // Basic whitespace.
        (" ", 0),
        ("    ", 0),
        ("\t", 0),
        ("  \t", 0),
        ("\u{202F}", 0),
        // Newline counting.
        ("\n", 1),
        ("\n ", 1),
        ("  \n", 1),
        ("  \n   ", 1),
        ("\r\n", 1),
        ("\r\n\r", 2),
        ("  \n\t \n  ", 2),
        ("\n\r", 2),
        (" \r\r\n \x0D", 3),
    ];

    for (src, newlines) in cases {
        t!(Both["a1/"]: src => Space(newlines));
    }
}

/// Checks how plain text is tokenized in markup mode.
#[test]
fn test_tokenize_text() {
    // Test basic text.
    t!(Markup[" /"]: "hello"      => Text("hello"));
    t!(Markup[" /"]: "reha-world" => Text("reha-world"));

    // Test code symbols in text.
    t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b"));
    t!(Markup[" /"]: ";,|/+"  => Text(";,|/+"));
    t!(Markup[" /"]: "=-a"     => Eq, Minus, Text("a"));
    t!(Markup[" "]: "#123"     => Text("#123"));

    // Test text ends.
    // `[""]` enables no suffix block, so only the bare input is checked.
    t!(Markup[""]: "hello " => Text("hello"), Space(0));
    t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}'));
}

/// Checks backslash escapes and `\u{...}` unicode escapes in markup mode.
#[test]
fn test_tokenize_escape_sequences() {
    // Each of these characters, preceded by a backslash, is expected to
    // yield an `Escape` carrying that character.
    let escapable = [
        '\\', '/', '[', ']', '{', '}', '*', '_', '=', '~', '\'', '"', '`', '$', '#', 'a',
        'u', '1',
    ];
    for c in escapable {
        t!(Markup: format!("\\{}", c) => Escape(c));
    }

    // Well-formed and malformed unicode escapes with a closing brace.
    t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence"));
    t!(Markup: r"\u{2603}" => Escape('☃'));
    t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence"));

    // Unicode escapes that are missing their closing brace.
    for src in [r"\u{", r"\u{1", r"\u{26A4", r"\u{1Q3P"] {
        t!(Markup[" /"]: src => Error(End, "expected closing brace"));
    }
    t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace);
}

/// Checks the symbols and shorthands that are tokens in markup mode.
///
/// Most cases restrict the suffix blocks via the bracketed literal; see
/// `BLOCKS` for the meaning of each character.
#[test]
fn test_tokenize_markup_symbols() {
    // Test markup tokens.
    t!(Markup[" a1"]: "*"   => Star);
    t!(Markup: "_"          => Underscore);
    // `[""]` enables no suffix block, so only the bare input is checked.
    t!(Markup[""]: "==="    => Eq, Eq, Eq);
    t!(Markup["a1/"]: "= "  => Eq, Space(0));
    t!(Markup[" "]: r"\"    => Linebreak);
    t!(Markup: "~"          => Shorthand('\u{00A0}'));
    t!(Markup["a1/"]: "-?"  => Shorthand('\u{00AD}'));
    t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}'));
    t!(Markup["a1/"]: "- "  => Minus, Space(0));
    t!(Markup[" "]: "+"     => Plus);
    t!(Markup[" "]: "1."    => EnumNumbering(NonZeroUsize::new(1).unwrap()));
    t!(Markup[" "]: "1.a"   => EnumNumbering(NonZeroUsize::new(1).unwrap()), Text("a"));
    t!(Markup[" /"]: "a1."  => Text("a1."));
}

/// Checks operator and punctuation tokens in code mode.
#[test]
fn test_tokenize_code_symbols() {
    // Symbols that are checked against every suffix block.
    let unrestricted = [
        (",", Comma),
        (";", Semicolon),
        (":", Colon),
        ("+", Plus),
        ("-", Minus),
        ("=", Eq),
        ("==", EqEq),
        ("!=", ExclEq),
        ("<=", LtEq),
        (">", Gt),
        (">=", GtEq),
        ("+=", PlusEq),
        ("-=", HyphEq),
        ("*=", StarEq),
        ("/=", SlashEq),
        ("..", Dots),
        ("=>", Arrow),
    ];
    for (src, kind) in unrestricted {
        t!(Code: src => kind);
    }

    // Symbols that are only checked against a subset of the suffix blocks.
    t!(Code[" a1"]: "*" => Star);
    t!(Code[" a1"]: "/" => Slash);
    t!(Code[" a/"]: "." => Dot);
    t!(Code[" /"]: "<" => Lt);

    // Adjacent symbols.
    t!(Code: "<=>" => LtEq, Gt);
    t!(Code[" a/"]: "..." => Dots, Dot);

    // A hyphen is a symbol at the start, but part of an identifier inside it.
    t!(Code[" /"]: "-1" => Minus, Int(1));
    t!(Code[" /"]: "-a" => Minus, Ident("a"));
    t!(Code[" /"]: "--1" => Minus, Minus, Int(1));
    t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a"));
    t!(Code[" /"]: "a-b" => Ident("a-b"));

    // A backslash is invalid in code.
    t!(Code: r"\" => Error(Full, "not valid here"));
}

/// Checks keyword recognition in markup (after `#`) and in code.
#[test]
fn test_tokenize_keywords() {
    // A list of a few (not all) keywords.
    let keywords = [
        ("not", Not),
        ("let", Let),
        ("if", If),
        ("else", Else),
        ("for", For),
        ("in", In),
        ("import", Import),
    ];

    for (s, kind) in keywords {
        // In markup, a keyword directly following a hash is a keyword token ...
        t!(Markup[" "]: format!("#{}", s) => kind);
        t!(Markup[" "]: format!("#{0}#{0}", s) => kind, kind);
        // ... but not when the hash is separated from it by a space.
        t!(Markup[" /"]: format!("# {}", s) => Text(&format!("# {s}")));

        // The bare word is a keyword in code and plain text in markup.
        t!(Code[" "]: s => kind);
        t!(Markup[" /"]: s => Text(s));
    }

    // Words that merely resemble keywords are identifiers.
    t!(Markup[" "]: "#letter" => Ident("letter"));
    for name in ["falser", "None", "True"] {
        t!(Code[" /"]: name => Ident(name));
    }
}

/// Checks raw text delimited by backticks in markup mode.
///
/// The arguments of the `Raw` helper are the raw text, the optional language
/// tag and the block flag.
#[test]
fn test_tokenize_raw_blocks() {
    // Test basic raw block.
    t!(Markup: "``"     => Raw("", None, false));
    t!(Markup: "`raw`"  => Raw("raw", None, false));
    // `[""]` enables no suffix block, so only the bare input is checked.
    t!(Markup[""]: "`]" => Error(End, "expected 1 backtick"));

    // Test special symbols in raw block.
    t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false));
    t!(Markup[""]: r"`\`` "   => Raw(r"\", None, false), Error(End, "expected 1 backtick"));

    // Test separated closing backticks.
    t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false));

    // Test more backticks.
    t!(Markup: "``nope``"             => Raw("", None, false), Text("nope"), Raw("", None, false));
    t!(Markup: "````🚀````"           => Raw("", None, false));
    t!(Markup[""]: "`````👩‍🚀````noend" => Error(End, "expected 5 backticks"));
    t!(Markup[""]: "````raw``````"    => Raw("", Some("raw"), false), Raw("", None, false));
}

/// Checks that whole words are tokenized as a single identifier in code mode.
#[test]
fn test_tokenize_idents() {
    let names = [
        // Plain ASCII identifiers, including underscores.
        "x",
        "value",
        "__main__",
        "_snake_case",
        // Non-ASCII identifiers.
        "α",
        "ម្តាយ",
        // Hyphens inside a word belong to the identifier.
        "kebab-case",
        "one-10",
    ];

    for name in names {
        t!(Code[" /"]: name => Ident(name));
    }
}

/// Checks integers, floats and unit-suffixed numbers in code mode.
#[test]
fn test_tokenize_numeric() {
    let ints = [("7", 7), ("012", 12)];
    let floats = [
        (".3", 0.3),
        ("0.3", 0.3),
        ("3.", 3.0),
        ("3.0", 3.0),
        ("14.3", 14.3),
        ("10e2", 1000.0),
        ("10e+0", 10.0),
        ("10e+1", 100.0),
        ("10e-2", 0.1),
        ("10.e1", 100.0),
        ("10.e-1", 1.0),
        (".1e1", 1.0),
        ("10E2", 1000.0),
    ];

    // Plain integers.
    for &(src, value) in &ints {
        t!(Code[" /"]: src => Int(value));
    }

    // Plain floats.
    for &(src, value) in &floats {
        t!(Code[" /"]: src => Float(value));
    }

    // Numbers written directly after one another.
    t!(Code[" /"]: ".2.3" => Float(0.2), Float(0.3));
    t!(Code[" /"]: "1.2.3" => Float(1.2), Float(0.3));
    t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));

    // An integer literal one past `i64::MAX` is expected to become a float.
    let large = i64::MAX as f64 + 1.0;
    t!(Code[" /"]: large.to_string() => Float(large));

    // Every integer and float above, as an `f64`.
    let nums: Vec<(&str, f64)> =
        ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats).collect();

    // Unit suffixes paired with a constructor for the expected token. This
    // local is deliberately not called `suffixes`, the name of the function
    // the `t!` macro calls.
    let units: &[(&str, fn(f64) -> SyntaxKind)] = &[
        ("mm", |x| Numeric(x, Unit::Length(AbsUnit::Mm))),
        ("pt", |x| Numeric(x, Unit::Length(AbsUnit::Pt))),
        ("cm", |x| Numeric(x, Unit::Length(AbsUnit::Cm))),
        ("in", |x| Numeric(x, Unit::Length(AbsUnit::In))),
        ("rad", |x| Numeric(x, Unit::Angle(AngleUnit::Rad))),
        ("deg", |x| Numeric(x, Unit::Angle(AngleUnit::Deg))),
        ("em", |x| Numeric(x, Unit::Em)),
        ("fr", |x| Numeric(x, Unit::Fr)),
        ("%", |x| Numeric(x, Unit::Percent)),
    ];

    // Every number combined with every unit.
    for &(src, value) in &nums {
        for &(unit, make) in units {
            t!(Code[" /"]: format!("{}{}", src, unit) => make(value));
        }
    }

    // Multiple dots close the number.
    t!(Code[" /"]: "1..2" => Int(1), Dots, Int(2));
    t!(Code[" /"]: "1..2.3" => Int(1), Dots, Float(2.3));
    t!(Code[" /"]: "1.2..3" => Float(1.2), Dots, Int(3));

    // An unknown suffix is an error.
    t!(Code[" /"]: "1foo" => Error(Full, "invalid number suffix"));
}

/// Checks string literals in code mode.
#[test]
fn test_tokenize_strings() {
    // Test basic strings.
    t!(Code: "\"hi\""        => Str("hi"));
    t!(Code: "\"hi\nthere\"" => Str("hi\nthere"));
    t!(Code: "\"🌎\""        => Str("🌎"));

    // Test unterminated.
    // `[""]` enables no suffix block, so only the bare input is checked.
    t!(Code[""]: "\"hi" => Error(End, "expected quote"));

    // Test escaped quote.
    t!(Code: r#""a\"bc""# => Str("a\"bc"));
    t!(Code[""]: r#""\""# => Error(End, "expected quote"));
}

/// Checks `//` line comments in both modes.
#[test]
fn test_tokenize_line_comments() {
    // A line comment without a trailing newline; no suffixes are appended.
    t!(Both[""]: "//" => LineComment);

    // A line comment ends at the newline, which becomes its own `Space`.
    // The last input contains a second `//` inside the comment.
    for src in ["//bc\n", "// bc \n", "//bc\r\n", "//a//b\n"] {
        t!(Both["a1/"]: src => LineComment, Space(1));
    }
}

/// Checks `/* ... */` block comments, including nesting, in both modes.
#[test]
fn test_tokenize_block_comments() {
    // Closed comments, checked with every suffix block: basic ones first,
    // then nesting of depth 1 and 2.
    let closed = ["/**/", "/*🏞*/", "/*\n*/", "/* /* */ */", "/*/*/**/*/*/"];
    for src in closed {
        t!(Both: src => BlockComment);
    }

    // Inputs checked without any suffix.
    let bare = [
        // Basic unclosed comment.
        "/*",
        // Two nested, one unclosed block comments.
        "/*/*/**/*/",
        // All combinations of up to two following slashes and stars.
        "/*",
        "/*/",
        "/**",
        "/*//",
        "/*/*",
        "/**/",
        "/***",
    ];
    for src in bare {
        t!(Both[""]: src => BlockComment);
    }

    // A terminator without a matching opener is an error.
    t!(Both: "/*Hi*/*/" => BlockComment, Error(Full, "unexpected end of block comment"));
}