// src/lexer.c #define CMMM__RS274NGC__LEXEME__STRIP_VENDOR #define CMMM__RS274NGC__LEXER__STRIP_VENDOR #include "cmmm/rs274ngc/lexer.h" #include #include #include #include #include #include #define X(SYM, REG, ...) \ [RS274NGC__LEXEME__KIND__##SYM - RS274NGC__LEXEME__REGEX_OFFSET] = "^(" REG ")", static const char *lexeme_regex_strings[RS274NGC__LEXEME__REGEX_COUNT] = { RS274NGC__LEXEME__KINDS () }; #undef X static regex_t lexeme_regexes[RS274NGC__LEXEME__REGEX_COUNT]; static void print_regex_error (const regex_t *comp, const char *regex, int errcode) { size_t length = regerror (errcode, comp, NULL, 0); char *buffer = malloc (length); regerror (errcode, comp, buffer, length); fprintf (stderr, "error: regcomp (\"%s\"): %s\n", regex, buffer); free (buffer); } static void __attribute__((constructor)) compile_regexes (void) { bool errored = false; for (int i = 0; i < RS274NGC__LEXEME__REGEX_COUNT; ++i) { regex_t *compiled = &lexeme_regexes[i]; const char *pattern = lexeme_regex_strings[i]; int errcode = regcomp (compiled, pattern, REG_EXTENDED); if (errcode) { print_regex_error (compiled, pattern, errcode); errored = true; } } if (errored) exit (EXIT_FAILURE); } struct rs274ngc__lexer rs274ngc__lexer__from_string_view (struct cmmm__string_view sv, const char *filename) { struct rs274ngc__lexer lexer = { .sv = sv, .cursor = sv.begin, .filename = filename, }; rs274ngc__lexer__next_lexeme (&lexer); return lexer; } static void skip_whitespace (struct rs274ngc__lexer *lexer) { while (lexer->cursor < lexer->sv.end && (*lexer->cursor == ' ' || *lexer->cursor == '\t')) lexer->cursor += 1; } void rs274ngc__lexer__next_lexeme (struct rs274ngc__lexer *lexer) { skip_whitespace (lexer); lexer->lexeme = (struct rs274ngc__lexeme) { .sv.begin = lexer->cursor }; if (lexer->cursor >= lexer->sv.end) { lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_FILE; lexer->lexeme.sv.end = lexer->sv.end; return; } if (*lexer->cursor == '\r') { if (*lexer->cursor == '\n') lexer->cursor += 1; lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_LINE; lexer->lexeme.sv.end = ++lexer->cursor; return; } if (*lexer->cursor == '\n') { lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_LINE; lexer->lexeme.sv.end = ++lexer->cursor; return; } regmatch_t regmatch; int longest_matching_regex = RS274NGC__LEXEME__REGEX_COUNT; int longest_matching_regex_length = 0; for (int i = 0; i < RS274NGC__LEXEME__REGEX_COUNT; ++i) { if (regexec (&lexeme_regexes[i], lexer->cursor, 1, ®match, 0)) continue; if (regmatch.rm_eo <= longest_matching_regex_length) continue; longest_matching_regex_length = regmatch.rm_eo; longest_matching_regex = i; // HACK(cmmm): choose first match #ifdef RS274NGC__LEXER__FIRST_MATCH break; #endif } if (longest_matching_regex != RS274NGC__LEXEME__REGEX_COUNT) { lexer->lexeme.kind = longest_matching_regex + RS274NGC__LEXEME__REGEX_OFFSET; lexer->lexeme.sv.end = lexer->cursor += longest_matching_regex_length; // HACK(cmmm): right trim lexemes #ifdef RS274NGC__LEXER__TRIM while (lexer->lexeme.sv.end > lexer->lexeme.sv.begin && (lexer->lexeme.sv.end[-1] == ' ' || lexer->lexeme.sv.end[-1] == '\t')) { lexer->lexeme.sv.end -= 1; } #endif switch (lexer->lexeme.kind) { #ifdef RS274NGC__LEXER__FILTER_COMMENTS case RS274NGC__LEXEME__KIND__COMMENT: return rs274ngc__lexer__next_lexeme (lexer); #endif default: return; } } // INVALID lexer->lexeme.sv.end = ++lexer->cursor; } // src/lexer.c ends here