283 lines
8 KiB
C++
283 lines
8 KiB
C++
/**************************************************************************/
/* aethex_tokenizer.cpp */
/**************************************************************************/
/* This file is part of: */
/* AETHEX ENGINE */
/* https://aethex.foundation */
/**************************************************************************/
/* Copyright (c) 2026-present AeThex Labs. */
/**************************************************************************/
|
|
|
|
#include "aethex_tokenizer.h"
|
|
|
|
#include <cctype>
|
|
|
|
// Returns the character located `offset` positions away from the cursor
// without consuming it.
//
// Yields '\0' whenever the requested position falls outside the source text,
// whether past the end or (for a negative offset) before the start.
char AeThexTokenizer::peek(int offset) const {
	const int idx = pos + offset;
	// Guard both ends: a negative offset must not index before the buffer.
	if (idx < 0 || idx >= source.length()) {
		return '\0';
	}
	return source[idx];
}
|
|
|
|
// Consumes the character under the cursor and returns it.
//
// Position tracking is updated as a side effect: a '\n' increments `line`
// and resets `column` to 1, any other character increments `column`.
// When the cursor is already at or past the end of the source, nothing is
// consumed and '\0' is returned, mirroring what peek() reports there.
char AeThexTokenizer::advance() {
	if (pos >= source.length()) {
		return '\0';
	}

	const char c = source[pos++];
	if (c == '\n') {
		line++;
		column = 1;
	} else {
		column++;
	}
	return c;
}
|
|
|
|
// Conditionally consumes one character: if the character under the cursor
// equals `expected` it is consumed and true is returned; otherwise the cursor
// is left untouched and false is returned.
bool AeThexTokenizer::match(char expected) {
	const bool is_match = (peek() == expected);
	if (is_match) {
		advance();
	}
	return is_match;
}
|
|
|
|
// Moves the cursor past insignificant input: spaces, tabs, carriage returns
// and '#' comments. Newlines are NOT skipped here; the loop stops at the
// first character that is none of the above, or at the end of the source.
void AeThexTokenizer::skip_whitespace() {
	for (;;) {
		if (pos >= source.length()) {
			return;
		}

		switch (peek()) {
			case ' ':
			case '\t':
			case '\r':
				advance();
				break;
			case '#':
				skip_comment();
				break;
			default:
				return;
		}
	}
}
|
|
|
|
// Consumes characters up to, but not including, the next '\n' (or up to the
// end of the source if no newline follows).
void AeThexTokenizer::skip_comment() {
	for (; pos < source.length(); advance()) {
		if (peek() == '\n') {
			break;
		}
	}
}
|
|
|
|
// Builds a token of the given type and text, stamped with the tokenizer's
// current `line` and `column` at the moment of the call.
AeThexTokenizer::Token AeThexTokenizer::make_token(TokenType type, const String &value) {
	Token result;
	result.column = column;
	result.line = line;
	result.value = value;
	result.type = type;
	return result;
}
|
|
|
|
// Scans the body of a string literal whose opening delimiter has already been
// consumed by the caller.
//
// `quote` is the delimiter that terminates the literal. Backslash escapes
// \n, \t, \r and \\ are decoded; any other escaped character is kept as-is
// (which is how an escaped delimiter ends up in the text). Returns a
// TK_TEMPLATE_STRING token for backtick literals and TK_STRING otherwise, or
// a TK_ERROR token if the source ends before the closing delimiter.
AeThexTokenizer::Token AeThexTokenizer::scan_string(char quote) {
	String text;

	for (;;) {
		if (pos >= source.length()) {
			// Input exhausted before the closing delimiter was seen.
			return make_token(TK_ERROR, "Unterminated string");
		}
		if (peek() == quote) {
			break;
		}

		// A backslash only starts an escape when another character follows it;
		// a backslash that is the very last character is copied verbatim.
		const bool starts_escape = peek() == '\\' && pos + 1 < source.length();
		if (!starts_escape) {
			text += advance();
			continue;
		}

		advance(); // Drop the backslash itself.
		const char code = advance();
		char decoded = code;
		if (code == 'n') {
			decoded = '\n';
		} else if (code == 't') {
			decoded = '\t';
		} else if (code == 'r') {
			decoded = '\r';
		}
		text += decoded;
	}

	advance(); // Consume the closing delimiter.
	if (quote == '`') {
		return make_token(TK_TEMPLATE_STRING, text);
	}
	return make_token(TK_STRING, text);
}
|
|
|
|
// Scans a numeric literal starting at the cursor and returns it as a
// TK_NUMBER token whose value is the literal's text.
//
// Accepted shape: one or more digits, optionally followed by a single '.'
// and one or more digits. The '.' is only consumed when a digit follows it,
// so inputs such as "1.2.3", "1..10" or "1.name" no longer collapse into one
// malformed number; the remaining '.' is left for the caller to tokenize.
AeThexTokenizer::Token AeThexTokenizer::scan_number() {
	String value;

	// Integer part. The cast keeps isdigit() well-defined for characters whose
	// plain `char` value is negative.
	while (pos < source.length() && isdigit(static_cast<unsigned char>(peek()))) {
		value += advance();
	}

	// Optional fractional part: exactly one '.', and only if a digit follows.
	if (peek() == '.' && isdigit(static_cast<unsigned char>(peek(1)))) {
		value += advance(); // The decimal point.
		while (pos < source.length() && isdigit(static_cast<unsigned char>(peek()))) {
			value += advance();
		}
	}

	return make_token(TK_NUMBER, value);
}
|
|
|
|
// Scans a run of alphanumeric characters and underscores starting at the
// cursor, then classifies it with check_keyword(): the returned token is a
// keyword token when the text is reserved and TK_IDENTIFIER otherwise. The
// token's value is the scanned text in both cases.
AeThexTokenizer::Token AeThexTokenizer::scan_identifier() {
	String value;

	while (pos < source.length()) {
		const char c = peek();
		// <cctype> classifiers require a value representable as unsigned char;
		// passing a negative plain `char` is undefined behaviour.
		if (!isalnum(static_cast<unsigned char>(c)) && c != '_') {
			break;
		}
		value += advance();
	}

	const TokenType type = check_keyword(value);
	return make_token(type, value);
}
|
|
|
|
// Maps an identifier's text to its reserved-word token type.
//
// The comparison is exact (case-sensitive). Returns TK_IDENTIFIER when the
// text is not one of the reserved words listed below.
AeThexTokenizer::TokenType AeThexTokenizer::check_keyword(const String &identifier) {
	struct KeywordEntry {
		const char *text;
		TokenType type;
	};

	static const KeywordEntry keyword_table[] = {
		// Core constructs
		{ "reality", TK_REALITY },
		{ "journey", TK_JOURNEY },
		{ "portal", TK_PORTAL },
		{ "beacon", TK_BEACON },
		{ "artifact", TK_ARTIFACT },
		{ "essence", TK_ESSENCE },
		{ "chronicle", TK_CHRONICLE },

		// Control flow
		{ "when", TK_WHEN },
		{ "otherwise", TK_OTHERWISE },
		{ "traverse", TK_TRAVERSE },
		{ "while", TK_WHILE },
		{ "break", TK_BREAK },
		{ "continue", TK_CONTINUE },
		{ "return", TK_RETURN },
		{ "yield", TK_YIELD },

		// Data
		{ "let", TK_LET },
		{ "const", TK_CONST },
		{ "mut", TK_MUT },
		{ "new", TK_NEW },

		// Platform
		{ "platform", TK_PLATFORM },
		{ "sync", TK_SYNC },
		{ "async", TK_ASYNC },
		{ "await", TK_AWAIT },
		{ "across", TK_ACROSS },
		{ "all", TK_ALL },

		// Actions
		{ "notify", TK_NOTIFY },
		{ "reveal", TK_REVEAL },
		{ "summon", TK_SUMMON },
		{ "banish", TK_BANISH },

		// Literals
		{ "true", TK_TRUE },
		{ "false", TK_FALSE },
		{ "null", TK_NULL },
		{ "self", TK_SELF },
		{ "super", TK_SUPER },

		// Logical operators
		{ "and", TK_AND },
		{ "or", TK_OR },
		{ "not", TK_NOT },
	};

	for (const KeywordEntry &entry : keyword_table) {
		if (identifier == entry.text) {
			return entry.type;
		}
	}

	return TK_IDENTIFIER;
}
|
|
|
|
// Tokenizes `p_source` into `tokens`, replacing any previous contents.
//
// The tokenizer state (source, cursor, line and column) is reset first. Spaces,
// tabs, carriage returns and '#' comments are skipped; newlines are emitted as
// TK_NEWLINE tokens. The list always ends with a TK_EOF token.
//
// Every token is stamped with the line and column at which it STARTS, so
// single-character, multi-character and multi-line tokens are all reported
// consistently.
//
// Lexical problems do not abort scanning: they are recorded as TK_ERROR tokens
// (unterminated string, unexpected character, or a lone '&' / '|') and the
// function still returns OK. Callers must inspect the token list for errors.
Error AeThexTokenizer::tokenize(const String &p_source) {
	source = p_source;
	pos = 0;
	line = 1;
	column = 1;
	tokens.clear();

	while (pos < source.length()) {
		skip_whitespace();

		if (pos >= source.length()) {
			break;
		}

		const char c = peek();
		// <cctype> classifiers are undefined for negative plain `char` values.
		const unsigned char uc = static_cast<unsigned char>(c);

		// Remember where this token begins before any character is consumed.
		const int start_line = line;
		const int start_column = column;

		Token token;

		if (c == '\n') {
			// Newline
			token = make_token(TK_NEWLINE);
			advance();
		} else if (c == '"' || c == '\'' || c == '`') {
			// String literals
			advance(); // Opening quote
			token = scan_string(c);
		} else if (isdigit(uc)) {
			// Numbers
			token = scan_number();
		} else if (isalpha(uc) || c == '_') {
			// Identifiers and keywords
			token = scan_identifier();
		} else {
			// Operators and punctuation
			advance();

			bool recognized = true;
			TokenType type = TK_ERROR;

			switch (c) {
				case '+':
					type = match('=') ? TK_PLUS_EQUAL : TK_PLUS;
					break;
				case '-':
					if (match('>')) {
						type = TK_ARROW;
					} else if (match('=')) {
						type = TK_MINUS_EQUAL;
					} else {
						type = TK_MINUS;
					}
					break;
				case '*':
					type = TK_STAR;
					break;
				case '/':
					type = TK_SLASH;
					break;
				case '%':
					type = TK_PERCENT;
					break;
				case '^':
					type = TK_CARET;
					break;
				case '=':
					if (match('=')) {
						type = TK_EQUAL_EQUAL;
					} else if (match('>')) {
						type = TK_FAT_ARROW;
					} else {
						type = TK_EQUAL;
					}
					break;
				case '!':
					type = match('=') ? TK_NOT_EQUAL : TK_NOT;
					break;
				case '<':
					type = match('=') ? TK_LESS_EQUAL : TK_LESS;
					break;
				case '>':
					type = match('=') ? TK_GREATER_EQUAL : TK_GREATER;
					break;
				case '&':
					// Only "&&" is an operator; a lone '&' is reported instead of
					// being silently dropped.
					if (match('&')) {
						type = TK_AND;
					} else {
						recognized = false;
					}
					break;
				case '|':
					// Only "||" is an operator; a lone '|' is reported instead of
					// being silently dropped.
					if (match('|')) {
						type = TK_OR;
					} else {
						recognized = false;
					}
					break;
				case ':':
					type = TK_COLON;
					break;
				case ';':
					type = TK_SEMICOLON;
					break;
				case ',':
					type = TK_COMMA;
					break;
				case '.':
					type = TK_DOT;
					break;
				case '(':
					type = TK_PAREN_OPEN;
					break;
				case ')':
					type = TK_PAREN_CLOSE;
					break;
				case '[':
					type = TK_BRACKET_OPEN;
					break;
				case ']':
					type = TK_BRACKET_CLOSE;
					break;
				case '{':
					type = TK_BRACE_OPEN;
					break;
				case '}':
					type = TK_BRACE_CLOSE;
					break;
				default:
					recognized = false;
					break;
			}

			if (recognized) {
				token = make_token(type);
			} else {
				token = make_token(TK_ERROR, String("Unexpected character: ") + c);
			}
		}

		// make_token() stamps the position at call time, which for most tokens
		// is after the lexeme was consumed; overwrite it with the start.
		token.line = start_line;
		token.column = start_column;
		tokens.push_back(token);
	}

	tokens.push_back(make_token(TK_EOF));
	return OK;
}
|
|
|
|
// Returns a human-readable name for a token type, intended for diagnostics.
//
// Every token type produced by this tokenizer has a name; any other value
// yields "UNKNOWN".
String AeThexTokenizer::token_type_to_string(TokenType type) {
	switch (type) {
		// Literals and identifiers
		case TK_IDENTIFIER: return "IDENTIFIER";
		case TK_NUMBER: return "NUMBER";
		case TK_STRING: return "STRING";
		case TK_TEMPLATE_STRING: return "TEMPLATE_STRING";
		case TK_TRUE: return "TRUE";
		case TK_FALSE: return "FALSE";
		case TK_NULL: return "NULL";
		case TK_SELF: return "SELF";
		case TK_SUPER: return "SUPER";

		// Core constructs
		case TK_REALITY: return "REALITY";
		case TK_JOURNEY: return "JOURNEY";
		case TK_PORTAL: return "PORTAL";
		case TK_BEACON: return "BEACON";
		case TK_ARTIFACT: return "ARTIFACT";
		case TK_ESSENCE: return "ESSENCE";
		case TK_CHRONICLE: return "CHRONICLE";

		// Control flow
		case TK_WHEN: return "WHEN";
		case TK_OTHERWISE: return "OTHERWISE";
		case TK_TRAVERSE: return "TRAVERSE";
		case TK_WHILE: return "WHILE";
		case TK_BREAK: return "BREAK";
		case TK_CONTINUE: return "CONTINUE";
		case TK_RETURN: return "RETURN";
		case TK_YIELD: return "YIELD";

		// Data
		case TK_LET: return "LET";
		case TK_CONST: return "CONST";
		case TK_MUT: return "MUT";
		case TK_NEW: return "NEW";

		// Platform
		case TK_PLATFORM: return "PLATFORM";
		case TK_SYNC: return "SYNC";
		case TK_ASYNC: return "ASYNC";
		case TK_AWAIT: return "AWAIT";
		case TK_ACROSS: return "ACROSS";
		case TK_ALL: return "ALL";

		// Actions
		case TK_NOTIFY: return "NOTIFY";
		case TK_REVEAL: return "REVEAL";
		case TK_SUMMON: return "SUMMON";
		case TK_BANISH: return "BANISH";

		// Logical operators
		case TK_AND: return "AND";
		case TK_OR: return "OR";
		case TK_NOT: return "NOT";

		// Arithmetic and assignment operators
		case TK_PLUS: return "PLUS";
		case TK_PLUS_EQUAL: return "PLUS_EQUAL";
		case TK_MINUS: return "MINUS";
		case TK_MINUS_EQUAL: return "MINUS_EQUAL";
		case TK_STAR: return "STAR";
		case TK_SLASH: return "SLASH";
		case TK_PERCENT: return "PERCENT";
		case TK_CARET: return "CARET";
		case TK_EQUAL: return "EQUAL";
		case TK_ARROW: return "ARROW";
		case TK_FAT_ARROW: return "FAT_ARROW";

		// Comparison operators
		case TK_EQUAL_EQUAL: return "EQUAL_EQUAL";
		case TK_NOT_EQUAL: return "NOT_EQUAL";
		case TK_LESS: return "LESS";
		case TK_LESS_EQUAL: return "LESS_EQUAL";
		case TK_GREATER: return "GREATER";
		case TK_GREATER_EQUAL: return "GREATER_EQUAL";

		// Punctuation
		case TK_COLON: return "COLON";
		case TK_SEMICOLON: return "SEMICOLON";
		case TK_COMMA: return "COMMA";
		case TK_DOT: return "DOT";
		case TK_PAREN_OPEN: return "PAREN_OPEN";
		case TK_PAREN_CLOSE: return "PAREN_CLOSE";
		case TK_BRACKET_OPEN: return "BRACKET_OPEN";
		case TK_BRACKET_CLOSE: return "BRACKET_CLOSE";
		case TK_BRACE_OPEN: return "BRACE_OPEN";
		case TK_BRACE_CLOSE: return "BRACE_CLOSE";

		// Structural
		case TK_NEWLINE: return "NEWLINE";
		case TK_ERROR: return "ERROR";
		case TK_EOF: return "EOF";

		default: return "UNKNOWN";
	}
}
|