summaryrefslogtreecommitdiff
path: root/src/lexer.c
blob: e645bee311d9d685e5b5edd757b8b37f1210441a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// src/lexer.c

#define CMMM__RS274NGC__LEXEME__STRIP_VENDOR
#define CMMM__RS274NGC__LEXER__STRIP_VENDOR
#include "cmmm/rs274ngc/lexer.h"
#include <regex.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <ctype.h>

// X-macro callback used to build the pattern table below.  For each
// (SYM, REG, ...) entry it emits a designated initializer whose index is
// the lexeme kind rebased by RS274NGC__LEXEME__REGEX_OFFSET, and whose
// value is REG wrapped as "^(REG)" so a match can only start at the
// beginning of the subject string.  Any extra arguments are ignored.
#define X(SYM, REG, ...)                                                       \
  [RS274NGC__LEXEME__KIND__##SYM - RS274NGC__LEXEME__REGEX_OFFSET] = "^(" REG ")",
// Pattern source strings, one per regex-matched lexeme kind.
// NOTE(review): RS274NGC__LEXEME__KINDS is defined outside this file
// (presumably in lexer.h) and is expected to expand to a list of X(...)
// invocations -- confirm that it covers every index below
// RS274NGC__LEXEME__REGEX_COUNT, otherwise a slot stays NULL.
static const char *lexeme_regex_strings[RS274NGC__LEXEME__REGEX_COUNT] = {
  RS274NGC__LEXEME__KINDS ()
};
#undef X

// Compiled form of lexeme_regex_strings, index for index.  Filled in by
// compile_regexes () and read by rs274ngc__lexer__next_lexeme ().
static regex_t lexeme_regexes[RS274NGC__LEXEME__REGEX_COUNT];

// Report a regcomp () failure on stderr.
//
// COMP is the regex_t that was handed to regcomp (), REGEX is the pattern
// source text (echoed in the message), and ERRCODE is the non-zero value
// regcomp () returned.  The human-readable description is obtained from
// regerror (); if the buffer for it cannot be allocated, the numeric
// error code is printed instead so the failure is never silent.
static void
print_regex_error (const regex_t *comp, const char *regex, int errcode)
{
  // With a zero-sized buffer regerror () only reports the size needed,
  // including the terminating NUL.
  size_t length = regerror (errcode, comp, NULL, 0);
  char *buffer = malloc (length);
  if (buffer == NULL) {
    fprintf (stderr, "error: regcomp (\"%s\"): error code %d\n",
             regex, errcode);
    return;
  }

  regerror (errcode, comp, buffer, length);
  fprintf (stderr, "error: regcomp (\"%s\"): %s\n", regex, buffer);
  free (buffer);
}

// Load-time initialiser (constructor attribute): compiles every entry of
// lexeme_regex_strings into the matching slot of lexeme_regexes using
// POSIX extended syntax.  Every failing pattern is reported before the
// process is terminated with EXIT_FAILURE, so all bad patterns show up
// in a single run.
static void __attribute__((constructor))
compile_regexes (void)
{
  int failures = 0;

  for (int idx = 0; idx < RS274NGC__LEXEME__REGEX_COUNT; idx++) {
    int rc = regcomp (&lexeme_regexes[idx], lexeme_regex_strings[idx],
                      REG_EXTENDED);
    if (rc == 0)
      continue;

    print_regex_error (&lexeme_regexes[idx], lexeme_regex_strings[idx], rc);
    failures += 1;
  }

  if (failures != 0)
    exit (EXIT_FAILURE);
}

struct rs274ngc__lexer
rs274ngc__lexer__from_string_view (struct cmmm__string_view sv,
                                   const char *filename)
{
  struct rs274ngc__lexer lexer = {
    .sv = sv,
    .cursor = sv.begin,
    .filename = filename,
  };

  rs274ngc__lexer__next_lexeme (&lexer);
  return lexer;
}

// Advance the cursor past any run of spaces and horizontal tabs, never
// moving beyond the end of the lexer's string view.  Line terminators are
// not skipped here.
static void
skip_whitespace (struct rs274ngc__lexer *lexer)
{
  while (lexer->cursor < lexer->sv.end) {
    switch (*lexer->cursor) {
    case ' ':
    case '\t':
      lexer->cursor += 1;
      continue;
    default:
      return;
    }
  }
}

// Scan the next lexeme at the cursor and store it in lexer->lexeme,
// advancing the cursor past it.
//
// Order of recognition:
//   1. Leading spaces/tabs are skipped.
//   2. At or past sv.end an END_OF_FILE lexeme is produced; the cursor is
//      not moved, so repeated calls keep returning END_OF_FILE.
//   3. "\r\n", a lone "\r" and a lone "\n" each produce exactly one
//      END_OF_LINE lexeme.
//   4. Otherwise every compiled lexeme regex is tried at the cursor and
//      the longest non-empty match wins (ties go to the lowest index).
//      When RS274NGC__LEXER__FIRST_MATCH is defined the first non-empty
//      match wins instead.
//   5. If nothing matches, a single character is consumed and the lexeme
//      kind is left at its zero-initialised value (the INVALID case).
//
// Compile-time options: RS274NGC__LEXER__TRIM strips trailing spaces/tabs
// from the lexeme's view (the cursor is unaffected), and
// RS274NGC__LEXER__FILTER_COMMENTS silently skips COMMENT lexemes.
void
rs274ngc__lexer__next_lexeme (struct rs274ngc__lexer *lexer)
{
  skip_whitespace (lexer);

  // Every other field of the lexeme, including kind, starts out zeroed.
  lexer->lexeme = (struct rs274ngc__lexeme) { .sv.begin = lexer->cursor };

  if (lexer->cursor >= lexer->sv.end) {
    lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_FILE;
    lexer->lexeme.sv.end = lexer->sv.end;
    return;
  }

  if (*lexer->cursor == '\r' || *lexer->cursor == '\n') {
    // A CR immediately followed by LF is one line terminator, not two.
    // Look at the *next* byte (bounds-checked) rather than re-testing the
    // current one.
    if (*lexer->cursor == '\r' &&
        lexer->cursor + 1 < lexer->sv.end &&
        lexer->cursor[1] == '\n')
      lexer->cursor += 1;

    lexer->lexeme.kind = RS274NGC__LEXEME__KIND__END_OF_LINE;
    lexer->lexeme.sv.end = ++lexer->cursor;
    return;
  }

  // NOTE(review): regexec () takes a NUL-terminated string and knows
  // nothing about sv.end.  This assumes the buffer behind lexer->sv is
  // NUL-terminated and that no pattern can match across sv.end -- confirm
  // against the callers that construct the string view.
  regmatch_t regmatch;
  int longest_matching_regex = RS274NGC__LEXEME__REGEX_COUNT; // "none"
  int longest_matching_regex_length = 0;
  for (int i = 0; i < RS274NGC__LEXEME__REGEX_COUNT; ++i) {
    if (regexec (&lexeme_regexes[i], lexer->cursor, 1, &regmatch, 0)) continue;
    // Patterns are anchored with '^', so rm_eo is the match length.
    // Rejecting lengths <= the current best also discards empty matches,
    // which would otherwise stall the lexer.
    if (regmatch.rm_eo <= longest_matching_regex_length) continue;

    longest_matching_regex_length = regmatch.rm_eo;
    longest_matching_regex = i;

    // HACK(cmmm): choose first match
#ifdef RS274NGC__LEXER__FIRST_MATCH
    break;
#endif
  }

  if (longest_matching_regex != RS274NGC__LEXEME__REGEX_COUNT) {
    lexer->lexeme.kind = longest_matching_regex + RS274NGC__LEXEME__REGEX_OFFSET;
    lexer->lexeme.sv.end = lexer->cursor += longest_matching_regex_length;

    // HACK(cmmm): right trim lexemes
#ifdef RS274NGC__LEXER__TRIM
    while (lexer->lexeme.sv.end > lexer->lexeme.sv.begin &&
           (lexer->lexeme.sv.end[-1] == ' ' ||
            lexer->lexeme.sv.end[-1] == '\t')) {
      lexer->lexeme.sv.end -= 1;
    }
#endif

#ifdef RS274NGC__LEXER__FILTER_COMMENTS
    // Comments are dropped by scanning again from the updated cursor.
    if (lexer->lexeme.kind == RS274NGC__LEXEME__KIND__COMMENT)
      rs274ngc__lexer__next_lexeme (lexer);
#endif
    return;
  }

  // INVALID: no pattern matched.  Consume one character so the lexer
  // always makes progress; kind keeps its zero-initialised value.
  lexer->lexeme.sv.end = ++lexer->cursor;
}

// src/lexer.c ends here