diff options
| -rw-r--r-- | include/cmmm/rs274ngc/lexer.h | 35 | ||||
| -rw-r--r-- | src/lexer.c | 142 |
2 files changed, 177 insertions, 0 deletions
diff --git a/include/cmmm/rs274ngc/lexer.h b/include/cmmm/rs274ngc/lexer.h new file mode 100644 index 0000000..eca972e --- /dev/null +++ b/include/cmmm/rs274ngc/lexer.h | |||
| @@ -0,0 +1,35 @@ | |||
| 1 | // include/cmmm/rs274ngc/lexer.h | ||
| 2 | |||
| 3 | #ifndef INCLUDED__RS274NGC__LEXER__H | ||
| 4 | #define INCLUDED__RS274NGC__LEXER__H | ||
| 5 | |||
| 6 | // #define CMMM__RS274NGC__LEXER__STRIP_VENDOR | ||
| 7 | |||
| 8 | #include <stddef.h> | ||
| 9 | #include "cmmm/string-view.h" | ||
| 10 | #include "cmmm/rs274ngc/lexeme.h" | ||
| 11 | |||
// Lexer behaviour switches, tested with #ifdef in src/lexer.c.

// Accept the first pattern that yields a non-empty match instead of
// scanning every pattern for the longest match.
#define RS274NGC__LEXER__FIRST_MATCH
// Strip trailing spaces and tabs from the end of each regex-matched lexeme.
#define RS274NGC__LEXER__TRIM
// When enabled, COMMENT lexemes are skipped and the next lexeme is
// returned instead.
// #define RS274NGC__LEXER__FILTER_COMMENTS
| 15 | |||
// Lexer state: the text being scanned, the current read position and
// the lexeme produced by the most recent scan.
struct cmmm__rs274ngc__lexer {
  // Text to lex; scanning stops once cursor reaches sv.end.
  struct cmmm__string_view sv;
  // Next unread character; starts at sv.begin and only moves forward.
  const char *cursor;
  // Stored exactly as passed to the constructor; the lexer itself never
  // reads it (presumably kept for diagnostics by callers -- confirm).
  const char *filename;
  // Result of the latest call to cmmm__rs274ngc__lexer__next_lexeme.
  struct cmmm__rs274ngc__lexeme lexeme;
};
| 22 | |||
// Build a lexer over the given view and scan the first lexeme, so the
// returned lexer already has `lexeme` populated.  `filename` is stored
// as-is (the pointer is not copied).
struct cmmm__rs274ngc__lexer cmmm__rs274ngc__lexer__from_string_view (struct cmmm__string_view, const char *filename);
// Skip leading spaces/tabs, scan one lexeme at the cursor, store it in
// `lexeme` and advance the cursor past it.
void cmmm__rs274ngc__lexer__next_lexeme (struct cmmm__rs274ngc__lexer *);
| 25 | |||
// Opt-in short names: a translation unit that defines
// CMMM__RS274NGC__LEXER__STRIP_VENDOR before including this header can
// refer to the lexer API without the `cmmm__` vendor prefix.
#ifdef CMMM__RS274NGC__LEXER__STRIP_VENDOR
#define RS274NGC__LEXER__STRIP_VENDOR CMMM__RS274NGC__LEXER__STRIP_VENDOR
#define rs274ngc__lexer cmmm__rs274ngc__lexer
#define rs274ngc__lexer__from_string_view cmmm__rs274ngc__lexer__from_string_view
#define rs274ngc__lexer__next_lexeme cmmm__rs274ngc__lexer__next_lexeme
#endif // CMMM__RS274NGC__LEXER__STRIP_VENDOR
| 32 | |||
| 33 | #endif // INCLUDED__RS274NGC__LEXER__H | ||
| 34 | |||
| 35 | // include/cmmm/rs274ngc/lexer.h ends here | ||
diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..e645bee --- /dev/null +++ b/src/lexer.c | |||
| @@ -0,0 +1,142 @@ | |||
| 1 | // src/lexer.c | ||
| 2 | |||
| 3 | #define CMMM__RS274NGC__LEXEME__STRIP_VENDOR | ||
| 4 | #define CMMM__RS274NGC__LEXER__STRIP_VENDOR | ||
| 5 | #include "cmmm/rs274ngc/lexer.h" | ||
| 6 | #include <regex.h> | ||
| 7 | #include <assert.h> | ||
| 8 | #include <stdlib.h> | ||
| 9 | #include <stdio.h> | ||
| 10 | #include <stdbool.h> | ||
| 11 | #include <ctype.h> | ||
| 12 | |||
// Pattern table built with an X-macro: every X(SYM, REG, ...) entry
// produced by RS274NGC__LEXEME__KINDS () (defined outside this file,
// presumably in cmmm/rs274ngc/lexeme.h) becomes a designated initializer
// that stores "^(" REG ")" at index KIND__SYM - REGEX_OFFSET.  The
// leading '^' anchors each pattern to the start of the text handed to
// regexec.
#define X(SYM, REG, ...) \
  [RS274NGC__LEXEME__KIND__##SYM - RS274NGC__LEXEME__REGEX_OFFSET] = "^(" REG ")",
static const char *lexeme_regex_strings[RS274NGC__LEXEME__REGEX_COUNT] = {
  RS274NGC__LEXEME__KINDS ()
};
#undef X
| 19 | |||
// Compiled form of lexeme_regex_strings, index for index; filled in by
// compile_regexes.
static regex_t lexeme_regexes[RS274NGC__LEXEME__REGEX_COUNT];
| 21 | |||
// Report a regcomp failure on stderr.
//
// comp    -- the regex_t that regcomp was asked to fill in
// regex   -- the pattern source text, echoed in the message
// errcode -- the non-zero value returned by regcomp
//
// The message text is obtained from regerror.  The exact-size buffer is
// heap allocated; if that allocation fails the message is rendered
// (possibly truncated) into a small stack buffer instead, so the error
// is still reported rather than dereferencing a null pointer.
static void
print_regex_error (const regex_t *comp, const char *regex, int errcode)
{
  // With a zero-sized buffer regerror only reports the size it needs.
  size_t length = regerror (errcode, comp, NULL, 0);
  char fallback[128];
  char *buffer = malloc (length);
  const char *message = buffer;

  if (buffer != NULL) {
    regerror (errcode, comp, buffer, length);
  } else {
    // regerror NUL-terminates whatever fits when the size is non-zero.
    regerror (errcode, comp, fallback, sizeof fallback);
    message = fallback;
  }

  fprintf (stderr, "error: regcomp (\"%s\"): %s\n", regex, message);
  free (buffer);
}
| 31 | |||
| 32 | static void __attribute__((constructor)) | ||
| 33 | compile_regexes (void) | ||
| 34 | { | ||
| 35 | bool errored = false; | ||
| 36 | |||
| 37 | for (int i = 0; i < RS274NGC__LEXEME__REGEX_COUNT; ++i) { | ||
| 38 | regex_t *compiled = &lexeme_regexes[i]; | ||
| 39 | const char *pattern = lexeme_regex_strings[i]; | ||
| 40 | int errcode = regcomp (compiled, pattern, REG_EXTENDED); | ||
| 41 | if (errcode) { | ||
| 42 | print_regex_error (compiled, pattern, errcode); | ||
| 43 | errored = true; | ||
| 44 | } | ||
| 45 | } | ||
| 46 | |||
| 47 | if (errored) | ||
| 48 | exit (EXIT_FAILURE); | ||
| 49 | } | ||
| 50 | |||
| 51 | struct rs274ngc__lexer | ||
| 52 | rs274ngc__lexer__from_string_view (struct cmmm__string_view sv, | ||
| 53 | const char *filename) | ||
| 54 | { | ||
| 55 | struct rs274ngc__lexer lexer = { | ||
| 56 | .sv = sv, | ||
| 57 | .cursor = sv.begin, | ||
| 58 | .filename = filename, | ||
| 59 | }; | ||
| 60 | |||
| 61 | rs274ngc__lexer__next_lexeme (&lexer); | ||
| 62 | return lexer; | ||
| 63 | } | ||
| 64 | |||
| 65 | static void | ||
| 66 | skip_whitespace (struct rs274ngc__lexer *lexer) | ||
| 67 | { | ||
| 68 | while (lexer->cursor < lexer->sv.end && (*lexer->cursor == ' ' || | ||
| 69 | *lexer->cursor == '\t')) | ||
| 70 | lexer->cursor += 1; | ||
| 71 | } | ||
| 72 | |||
| 73 | void | ||
| 74 | rs274ngc__lexer__next_lexeme (struct rs274ngc__lexer *lexer) | ||
| 75 | { | ||
| 76 | skip_whitespace (lexer); | ||
| 77 | lexer->lexeme = (struct rs274ngc__lexeme) { .sv.begin = lexer->cursor }; | ||
| 78 | |||
| 79 | if (lexer->cursor >= lexer->sv.end) { | ||
| 80 | lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_FILE; | ||
| 81 | lexer->lexeme.sv.end = lexer->sv.end; | ||
| 82 | return; | ||
| 83 | } | ||
| 84 | |||
| 85 | if (*lexer->cursor == '\r') { | ||
| 86 | if (*lexer->cursor == '\n') | ||
| 87 | lexer->cursor += 1; | ||
| 88 | |||
| 89 | lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_LINE; | ||
| 90 | lexer->lexeme.sv.end = ++lexer->cursor; | ||
| 91 | return; | ||
| 92 | } | ||
| 93 | |||
| 94 | if (*lexer->cursor == '\n') { | ||
| 95 | lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_LINE; | ||
| 96 | lexer->lexeme.sv.end = ++lexer->cursor; | ||
| 97 | return; | ||
| 98 | } | ||
| 99 | |||
| 100 | regmatch_t regmatch; | ||
| 101 | int longest_matching_regex = RS274NGC__LEXEME__REGEX_COUNT; | ||
| 102 | int longest_matching_regex_length = 0; | ||
| 103 | for (int i = 0; i < RS274NGC__LEXEME__REGEX_COUNT; ++i) { | ||
| 104 | if (regexec (&lexeme_regexes[i], lexer->cursor, 1, ®match, 0)) continue; | ||
| 105 | if (regmatch.rm_eo <= longest_matching_regex_length) continue; | ||
| 106 | |||
| 107 | longest_matching_regex_length = regmatch.rm_eo; | ||
| 108 | longest_matching_regex = i; | ||
| 109 | |||
| 110 | // HACK(cmmm): choose first match | ||
| 111 | #ifdef RS274NGC__LEXER__FIRST_MATCH | ||
| 112 | break; | ||
| 113 | #endif | ||
| 114 | } | ||
| 115 | |||
| 116 | if (longest_matching_regex != RS274NGC__LEXEME__REGEX_COUNT) { | ||
| 117 | lexer->lexeme.kind = longest_matching_regex + RS274NGC__LEXEME__REGEX_OFFSET; | ||
| 118 | lexer->lexeme.sv.end = lexer->cursor += longest_matching_regex_length; | ||
| 119 | |||
| 120 | // HACK(cmmm): right trim lexemes | ||
| 121 | #ifdef RS274NGC__LEXER__TRIM | ||
| 122 | while (lexer->lexeme.sv.end > lexer->lexeme.sv.begin && | ||
| 123 | (lexer->lexeme.sv.end[-1] == ' ' || | ||
| 124 | lexer->lexeme.sv.end[-1] == '\t')) { | ||
| 125 | lexer->lexeme.sv.end -= 1; | ||
| 126 | } | ||
| 127 | #endif | ||
| 128 | |||
| 129 | switch (lexer->lexeme.kind) { | ||
| 130 | #ifdef RS274NGC__LEXER__FILTER_COMMENTS | ||
| 131 | case RS274NGC__LEXEME__KIND__COMMENT: | ||
| 132 | return rs274ngc__lexer__next_lexeme (lexer); | ||
| 133 | #endif | ||
| 134 | default: return; | ||
| 135 | } | ||
| 136 | } | ||
| 137 | |||
| 138 | // INVALID | ||
| 139 | lexer->lexeme.sv.end = ++lexer->cursor; | ||
| 140 | } | ||
| 141 | |||
| 142 | // src/lexer.c ends here | ||
