From 1a4ca5bef333edd4ebd12a7a67495af54beeb5f3 Mon Sep 17 00:00:00 2001
From: Martin Michalec
Date: Sun, 22 Feb 2026 06:17:42 +0300
Subject: add lexer implementation

---
 include/cmmm/rs274ngc/lexer.h |  42 +++++++++++
 src/lexer.c                   | 169 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 211 insertions(+)
 create mode 100644 include/cmmm/rs274ngc/lexer.h
 create mode 100644 src/lexer.c

diff --git a/include/cmmm/rs274ngc/lexer.h b/include/cmmm/rs274ngc/lexer.h
new file mode 100644
index 0000000..eca972e
--- /dev/null
+++ b/include/cmmm/rs274ngc/lexer.h
@@ -0,0 +1,42 @@
+// include/cmmm/rs274ngc/lexer.h
+
+#ifndef INCLUDED__RS274NGC__LEXER__H
+#define INCLUDED__RS274NGC__LEXER__H
+
+// #define CMMM__RS274NGC__LEXER__STRIP_VENDOR
+
+// NOTE(review): this header's name was lost in extraction; <stddef.h> is a guess.
+#include <stddef.h>
+#include "cmmm/string-view.h"
+#include "cmmm/rs274ngc/lexeme.h"
+
+// Take the first regex that matches instead of searching for the longest match.
+#define RS274NGC__LEXER__FIRST_MATCH
+// Strip trailing spaces and tabs from regex-matched lexemes.
+#define RS274NGC__LEXER__TRIM
+// When defined, comment lexemes are skipped rather than returned.
+// #define RS274NGC__LEXER__FILTER_COMMENTS
+
+// Lexer state: a cursor over `sv` plus the lexeme most recently read from it.
+struct cmmm__rs274ngc__lexer {
+  struct cmmm__string_view sv;          // input being lexed
+  const char *cursor;                   // next unread byte of `sv`
+  const char *filename;                 // stored as given; not read in lexer.c
+  struct cmmm__rs274ngc__lexeme lexeme; // lexeme read by the latest advance
+};
+
+// Build a lexer over the given view and read its first lexeme into `.lexeme`.
+struct cmmm__rs274ngc__lexer cmmm__rs274ngc__lexer__from_string_view (struct cmmm__string_view, const char *filename);
+// Read the next lexeme into `->lexeme`, moving `->cursor` past it.
+void cmmm__rs274ngc__lexer__next_lexeme (struct cmmm__rs274ngc__lexer *);
+
+#ifdef CMMM__RS274NGC__LEXER__STRIP_VENDOR
+#define RS274NGC__LEXER__STRIP_VENDOR CMMM__RS274NGC__LEXER__STRIP_VENDOR
+#define rs274ngc__lexer cmmm__rs274ngc__lexer
+#define rs274ngc__lexer__from_string_view cmmm__rs274ngc__lexer__from_string_view
+#define rs274ngc__lexer__next_lexeme cmmm__rs274ngc__lexer__next_lexeme
+#endif // CMMM__RS274NGC__LEXER__STRIP_VENDOR
+
+#endif // INCLUDED__RS274NGC__LEXER__H
+
+// include/cmmm/rs274ngc/lexer.h ends here
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..e645bee
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,169 @@
+// src/lexer.c
+
+#define CMMM__RS274NGC__LEXEME__STRIP_VENDOR
+#define CMMM__RS274NGC__LEXER__STRIP_VENDOR
+#include "cmmm/rs274ngc/lexer.h"
+// NOTE(review): six system header names were lost in extraction. These five
+// are the ones this file uses; the sixth is unknown -- confirm.
+#include <regex.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Anchored POSIX extended regex source for each kind listed by
+// RS274NGC__LEXEME__KINDS (), indexed by kind - RS274NGC__LEXEME__REGEX_OFFSET.
+#define X(SYM, REG, ...) \
+  [RS274NGC__LEXEME__KIND__##SYM - RS274NGC__LEXEME__REGEX_OFFSET] = "^(" REG ")",
+static const char *lexeme_regex_strings[RS274NGC__LEXEME__REGEX_COUNT] = {
+  RS274NGC__LEXEME__KINDS ()
+};
+#undef X
+
+// Compiled forms of lexeme_regex_strings, filled in by compile_regexes ().
+static regex_t lexeme_regexes[RS274NGC__LEXEME__REGEX_COUNT];
+
+// Print the regcomp () failure `errcode` for pattern `regex` on stderr.
+static void
+print_regex_error (const regex_t *comp, const char *regex, int errcode)
+{
+  // A zero-sized call returns the buffer size needed for the whole message.
+  size_t length = regerror (errcode, comp, NULL, 0);
+  char *buffer = malloc (length);
+  if (buffer == NULL) {
+    // Out of memory: still say which pattern failed.
+    fprintf (stderr, "error: regcomp (\"%s\"): error code %d\n", regex, errcode);
+    return;
+  }
+
+  regerror (errcode, comp, buffer, length);
+  fprintf (stderr, "error: regcomp (\"%s\"): %s\n", regex, buffer);
+  free (buffer);
+}
+
+// Runs before main (): compile every lexeme regex, report each failure, and
+// exit with EXIT_FAILURE if any pattern did not compile.
+static void __attribute__((constructor))
+compile_regexes (void)
+{
+  bool errored = false;
+
+  for (int i = 0; i < RS274NGC__LEXEME__REGEX_COUNT; ++i) {
+    regex_t *compiled = &lexeme_regexes[i];
+    const char *pattern = lexeme_regex_strings[i];
+    int errcode = regcomp (compiled, pattern, REG_EXTENDED);
+    if (errcode) {
+      print_regex_error (compiled, pattern, errcode);
+      errored = true;
+    }
+  }
+
+  if (errored)
+    exit (EXIT_FAILURE);
+}
+
+// Build a lexer over `sv` and read the first lexeme, so `.lexeme` is valid on
+// return. `filename` is stored as given.
+struct rs274ngc__lexer
+rs274ngc__lexer__from_string_view (struct cmmm__string_view sv,
+                                   const char *filename)
+{
+  struct rs274ngc__lexer lexer = {
+    .sv = sv,
+    .cursor = sv.begin,
+    .filename = filename,
+  };
+
+  rs274ngc__lexer__next_lexeme (&lexer);
+  return lexer;
+}
+
+// Advance the cursor over spaces and tabs; line terminators are left in place.
+static void
+skip_whitespace (struct rs274ngc__lexer *lexer)
+{
+  while (lexer->cursor < lexer->sv.end && (*lexer->cursor == ' ' ||
+                                           *lexer->cursor == '\t'))
+    lexer->cursor += 1;
+}
+
+// Read the next lexeme into `lexer->lexeme` and move the cursor past it:
+// END_OF_FILE at the end of input, END_OF_LINE for "\n", "\r" or "\r\n", the
+// matching regex kind otherwise. When nothing matches, one byte is consumed
+// and `kind` keeps its zero-initialised value (presumably INVALID -- confirm).
+void
+rs274ngc__lexer__next_lexeme (struct rs274ngc__lexer *lexer)
+{
+  skip_whitespace (lexer);
+  lexer->lexeme = (struct rs274ngc__lexeme) { .sv.begin = lexer->cursor };
+
+  if (lexer->cursor >= lexer->sv.end) {
+    lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_FILE;
+    lexer->lexeme.sv.end = lexer->sv.end;
+    return;
+  }
+
+  if (*lexer->cursor == '\r') {
+    // Fold "\r\n" into a single END_OF_LINE; a lone '\r' also ends the line.
+    if (lexer->cursor + 1 < lexer->sv.end && lexer->cursor[1] == '\n')
+      lexer->cursor += 1;
+
+    lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_LINE;
+    lexer->lexeme.sv.end = ++lexer->cursor;
+    return;
+  }
+
+  if (*lexer->cursor == '\n') {
+    lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_LINE;
+    lexer->lexeme.sv.end = ++lexer->cursor;
+    return;
+  }
+
+  // regexec () scans up to a NUL byte, not up to sv.end, so the buffer behind
+  // `sv` has to be NUL-terminated.
+  regmatch_t regmatch;
+  int longest_matching_regex = RS274NGC__LEXEME__REGEX_COUNT;
+  int longest_matching_regex_length = 0;
+  for (int i = 0; i < RS274NGC__LEXEME__REGEX_COUNT; ++i) {
+    if (regexec (&lexeme_regexes[i], lexer->cursor, 1, &regmatch, 0)) continue;
+    if (regmatch.rm_eo <= longest_matching_regex_length) continue;
+
+    longest_matching_regex_length = regmatch.rm_eo;
+    longest_matching_regex = i;
+
+    // HACK(cmmm): choose first match
+#ifdef RS274NGC__LEXER__FIRST_MATCH
+    break;
+#endif
+  }
+
+  if (longest_matching_regex != RS274NGC__LEXEME__REGEX_COUNT) {
+    lexer->lexeme.kind = longest_matching_regex + RS274NGC__LEXEME__REGEX_OFFSET;
+    lexer->lexeme.sv.end = lexer->cursor += longest_matching_regex_length;
+
+    // HACK(cmmm): right trim lexemes
+#ifdef RS274NGC__LEXER__TRIM
+    while (lexer->lexeme.sv.end > lexer->lexeme.sv.begin &&
+           (lexer->lexeme.sv.end[-1] == ' ' ||
+            lexer->lexeme.sv.end[-1] == '\t')) {
+      lexer->lexeme.sv.end -= 1;
+    }
+#endif
+
+    switch (lexer->lexeme.kind) {
+#ifdef RS274NGC__LEXER__FILTER_COMMENTS
+    case RS274NGC__LEXEME__KIND__COMMENT:
+      // Skip the comment and deliver the lexeme after it. A void function
+      // may not `return` an expression, hence the separate statements.
+      rs274ngc__lexer__next_lexeme (lexer);
+      return;
+#endif
+    default: return;
+    }
+  }
+
+  // INVALID
+  lexer->lexeme.sv.end = ++lexer->cursor;
+}
+
+// src/lexer.c ends here
-- 
cgit v1.3