用到的HashTable:
HashTable.cpp:
using namespace std;
// Hashes a string into a bucket number so that strings can be used as
// HashTable keys (HashTable computes `key % capacity`).
//
// Each byte is XOR-ed into a 24-bit accumulator, shifted left by 0, 8 or 16
// bits depending on its position in the string.
//
// text: the string to hash.
// max:  number of buckets; must be positive.
// Returns a value in the range [0, max).
int operator % (const string & text, int max) {
    unsigned int code = 0;
    for (string::size_type i = 0; i < text.size(); ++ i) {
        // Read the character as unsigned: a plain `char` may be signed, in
        // which case bytes >= 0x80 would sign-extend, make the shift below
        // operate on a negative value and produce a negative bucket number.
        const unsigned int byte = static_cast<unsigned char>(text[i]);
        code ^= byte << (i * 8 % 24);
    }
    return static_cast<int>(code % static_cast<unsigned int>(max));
}
// Thrown by HashTable::get() when the requested key is not in the table.
class NoSuchKeyException {};

// Hash map from K to V using open addressing with linear probing.
//
// Requirements on the type parameters, as used by the code below:
//   * K and V are default-constructible, copy-constructible and assignable;
//   * `key % int` yields an int used as the bucket number (see hashIndex);
//   * `key == key` tells whether two keys are the same.
//
// The table grows when more than half of the slots are used and shrinks when
// fewer than a quarter are, so at least one slot is always free and every
// probe sequence terminates.
template <typename K, typename V>
class HashTable
{
private:
    // One slot of the probe array. `key` and `value` are only meaningful
    // while `isInUse` is true.
    class Entry
    {
    public:
        K key;
        V value;
        bool isInUse;
        // Value-initialise key and value so that copying an unused slot never
        // reads an indeterminate built-in value.
        Entry() : key(), value(), isInUse(false) {}
    };

    // The vector owns the slots, so no manual new[]/delete[] is needed and
    // nothing leaks or dangles when a copy throws.
    vector<Entry> entries;
    int capacity;   // number of slots; always equal to entries.size()
    int count;      // number of slots currently in use

    // Replaces the storage with `capacity2` empty slots.
    void initialize(int capacity2) {
        count = 0;
        capacity = capacity2;
        entries.assign(capacity, Entry());
    }

public:
    // Creates an empty table with two slots.
    HashTable() : entries(), capacity(0), count(0) {
        initialize(2);
    }

    // Deep-copies another table.
    HashTable(const HashTable & map2)
        : entries(map2.entries), capacity(map2.capacity), count(map2.count) {
    }

    // Deep-copies another table into this one. Self-assignment is a no-op,
    // and the copy is made before anything is released, so a throwing copy
    // leaves this table unchanged.
    HashTable & operator = (const HashTable & map2) {
        if (this != &map2) {
            vector<Entry> copy(map2.entries);
            entries.swap(copy);
            capacity = map2.capacity;
            count = map2.count;
        }
        return (*this);
    }

    // Removes every mapping and goes back to the initial two slots.
    void clear() {
        initialize(2);
    }

private:
    // Maps a key to its home slot in [0, capacity).
    int hashIndex(const K & key) const {
        int index = key % capacity;
        // A user-supplied operator% may return a negative or out-of-range
        // value; fold it back so it is always a valid slot number.
        index %= capacity;
        if (index < 0) {
            index += capacity;
        }
        return index;
    }

    // Returns the slot holding `key`, or, when the key is absent, the free
    // slot where it would be inserted. Probing starts at the home slot and
    // wraps around.
    int find(const K & key) const {
        int index = hashIndex(key);
        while (true) {
            if (! entries[index].isInUse) {
                return index;
            }
            if (entries[index].key == key) {
                return index;
            }
            index = (index + 1) % capacity;
        }
    }

    // Rebuilds the table with `capacity2` slots and re-inserts every mapping.
    void resize(int capacity2) {
        vector<Entry> entries0;
        entries0.swap(entries);
        const int capacity0 = capacity;
        initialize(capacity2);
        for (int i = 0; i < capacity0; ++ i) {
            if (entries0[i].isInUse) {
                put(entries0[i].key, entries0[i].value);
            }
        }
    }

public:
    // Associates `value` with `key`, overwriting any previous value.
    void put(const K & key, const V & value) {
        const int index = find(key);
        entries[index].value = value;
        if (entries[index].isInUse) return;
        entries[index].isInUse = true;
        entries[index].key = key;
        ++ count;
        // Keep at least half of the slots free.
        if (count > capacity / 2) {
            resize(capacity * 2);
        }
    }

    // Returns a copy of the value stored under `key`.
    // Throws NoSuchKeyException when the key is absent.
    V get(const K & key) const {
        const int index = find(key);
        if (! entries[index].isInUse) {
            throw NoSuchKeyException();
        }
        return entries[index].value;
    }

    // Removes the mapping for `key`. Returns false when the key was absent.
    bool remove(const K & key) {
        const int index = find(key);
        if (! entries[index].isInUse) return false;
        fillNotInUseEntry(index);
        -- count;
        if (count < capacity / 4) {
            resize(capacity / 2);
        }
        return true;
    }

private:
    // Deletes the entry at `index` without leaving a gap in any probe
    // sequence: later entries of the same cluster are shifted back into the
    // hole whenever their home slot does not lie cyclically in (hole, entry].
    void fillNotInUseEntry(int index) {
        int next = index;
        while (true) {
            next = (next + 1) % capacity;
            if (! entries[next].isInUse) {
                // End of the cluster: the hole becomes a free slot. Reset the
                // whole entry so no stale key/value copy is kept alive.
                entries[index] = Entry();
                return;
            }
            const int index0 = hashIndex(entries[next].key);
            if (index < next) {
                // No wrap-around between the hole and `next`.
                if (index0 > index &&
                    index0 <= next) continue;
            }
            else {
                // The probe sequence wrapped past the end of the array.
                if (index0 > index ||
                    index0 <= next) continue;
            }
            entries[index] = entries[next];
            index = next;
        }
    }

public:
    // Tells whether a mapping for `key` exists.
    bool containsKey(const K & key) const {
        const int index = find(key);
        return (entries[index].isInUse);
    }

    // Number of mappings currently stored.
    int size() const {
        return count;
    }

    // Returns all stored keys, in slot order.
    vector<K> getKeys() const {
        vector<K> vec;
        vec.reserve(count);
        for (int i = 0; i < capacity; ++ i) {
            if (entries[i].isInUse) {
                vec.push_back(entries[i].key);
            }
        }
        return vec;
    }
};
用到的token.cpp:
using namespace std;
// A lexical token. Its identity is the place where it starts: operator== and
// the hash below look only at `line` and `pos`.
struct Token
{
    int line;     // line the token was found on
    int pos;      // offset of the token's first character within that line
    string type;  // category label such as "id", "int" or "keyword"
    string text;  // the characters covered by the token
};

// Hashes a token by its position so it can be used as a HashTable key.
//
// base: number of buckets; must be positive.
// Returns a value in the range [0, base).
int operator % (const Token & t, int base) {
    // Mix in unsigned arithmetic: `line * 12347` overflows a signed int for
    // large line numbers, which is undefined behaviour and in practice gives
    // a negative bucket number. Unsigned arithmetic simply wraps around.
    const unsigned int mixed =
        static_cast<unsigned int>(t.line) * 12347u + static_cast<unsigned int>(t.pos);
    return static_cast<int>(mixed % static_cast<unsigned int>(base));
}

// Two tokens are equal when they start at the same line and offset; type and
// text are not compared.
bool operator == (const Token & t1, const Token & t2) {
    return t1.line == t2.line && t1.pos == t2.pos;
}

// Writes the token as tab-separated `line pos type "text"`.
ostream & operator << (ostream & out, const Token & t) {
    out << t.line << '\t' << t.pos << '\t' << t.type << "\t\"" << t.text << "\"";
    return out;
}
// Appends every line of the text file `filename` to `lines`.
// Throws runtime_error("Cannot open file: <filename>") when the file cannot
// be opened; `lines` is left untouched in that case.
void read_lines(const string & filename, vector<string> & lines) {
    ifstream input(filename.c_str());
    if (input.fail()) {
        stringstream message;
        message << "Cannot open file: " << filename;
        throw runtime_error(message.str());
    }
    for (string current; getline(input, current); ) {
        lines.push_back(current);
    }
    // The stream closes itself when `input` goes out of scope.
}
// Returns the index of the first occurrence of `to_find` in `source` at or
// after `start`, or -1 when there is none.
//
// A match that ends exactly at the end of `source` is found as well (the
// earlier hand-written loop stopped one position too early and missed it).
// A negative `start` is treated as 0.
int find(const string & source, const string & to_find, int start=0) {
    if (start < 0) {
        start = 0;
    }
    const string::size_type at =
        source.find(to_find, static_cast<string::size_type>(start));
    return at == string::npos ? -1 : static_cast<int>(at);
}
// Builds a token of category `type` that covers `len` characters of `source`
// starting at offset `pos` on line `line`.
// string::substr throws out_of_range when `pos` lies beyond the end of
// `source`.
Token make_token(int line, int pos, int len, const string & source, const string & type) {
    Token result;
    result.text = source.substr(pos, len);
    result.type = type;
    result.pos = pos;
    result.line = line;
    return result;
}
// Returns an "indent" token covering the run of tab characters at the very
// beginning of `source`. The token text is empty when the line does not start
// with a tab.
Token find_indent(int line, const string & source) {
    const string::size_type first_other = source.find_first_not_of('\t');
    // npos means the line is empty or consists of tabs only.
    const int tabs = static_cast<int>(
        first_other == string::npos ? source.size() : first_other);
    return make_token(line, 0, tabs, source, "indent");
}
// Returns a "blank" token covering the run of spaces and tabs that begins at
// offset `start`. The token text is empty when the character at `start` is
// neither a space nor a tab.
Token find_blank(int line, const string & source, int start) {
    const string::size_type begin = start;
    string::size_type stop = begin;
    while (stop < source.size() && (source[stop] == ' ' || source[stop] == '\t')) {
        ++ stop;
    }
    return make_token(line, start, static_cast<int>(stop - begin), source, "blank");
}
// Returns a "string" token for a double-quoted literal that begins at offset
// `start`. The token text is empty when the character at `start` is not '"'.
//
// Inside the literal a backslash may be followed by one of \ r n t ".
// Scanning stops
//   * after the closing quote (included in the token),
//   * in front of a character that is not a valid escape after a backslash,
//   * at the end of the line when no closing quote was seen.
Token find_string(int line, const string & source, int start) {
    const string::size_type total = source.size();
    string::size_type cursor = start;
    int matched = 0;

    if (cursor < total && source[cursor] == '"') {
        matched = 1;                       // the opening quote
        bool escaped = false;              // previous character was a backslash
        for (++ cursor; cursor < total; ++ cursor) {
            const char ch = source[cursor];
            if (escaped) {
                const bool known_escape =
                    ch == '\\' || ch == 'r' || ch == 'n' || ch == 't' || ch == '"';
                if (! known_escape) break;
                ++ matched;
                escaped = false;
                continue;
            }
            ++ matched;
            if (ch == '"') break;          // closing quote ends the literal
            escaped = (ch == '\\');
        }
    }
    return make_token(line, start, matched, source, "string");
}
// Returns an "int" token for a decimal integer literal that begins at offset
// `start`. The token text is empty when no literal starts there.
//
// Accepted forms:
//   * a non-zero digit followed by any number of digits, e.g. 7, 42, 100;
//   * a single 0 that is not followed by another digit.
// The second form used to be rejected, which made every source line
// containing a plain 0 fail with a lexical error. Literals with leading
// zeros such as 007 are still not recognised.
Token find_int(int line, const string & source, int start) {
    const string::size_type total = source.size();
    const string::size_type first = start;
    int len = 0;
    if (first < total && source[first] == '0') {
        const bool digit_follows =
            first + 1 < total && source[first + 1] >= '0' && source[first + 1] <= '9';
        if (! digit_follows) {
            len = 1;
        }
    }
    else {
        // The first character is known not to be '0' here, so accepting any
        // digit means the literal starts with 1-9.
        for (string::size_type i = first;
             i < total && source[i] >= '0' && source[i] <= '9'; ++ i) {
            ++ len;
        }
    }
    return make_token(line, start, len, source, "int");
}
// Returns an "id" token for an identifier that begins at offset `start`:
// an ASCII letter or underscore followed by any number of ASCII letters,
// digits or underscores. The token text is empty when no identifier starts
// there.
Token find_id(int line, const string & source, int start) {
    const string::size_type total = source.size();
    const string::size_type begin = start;
    string::size_type cursor = begin;
    for (; cursor < total; ++ cursor) {
        const char ch = source[cursor];
        const bool letter_or_underscore =
            (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
        // Digits are allowed everywhere except in the first position.
        const bool inner_digit = cursor != begin && ch >= '0' && ch <= '9';
        if (! (letter_or_underscore || inner_digit)) break;
    }
    return make_token(line, start, static_cast<int>(cursor - begin), source, "id");
}
// Fixed spellings that find_keyword() recognises: reserved words, brackets,
// the colon and the arithmetic operators.
const char *keywords[] = {
    "def", "(", ")", ":", "if", "else", "for", "in", "+", "-", "*", "/", "%"
};

// Returns a "keyword" token when one of the entries of `keywords` appears in
// `source` at offset `start`. The token text is empty when none matches.
Token find_keyword(int line, const string & source, int start) {
    const int total = sizeof(keywords) / sizeof(keywords[0]);
    // When several entries match, the one listed last wins; walking the table
    // backwards lets the first hit be returned directly.
    for (int k = total - 1; k >= 0; -- k) {
        const size_t width = strlen(keywords[k]);
        if (source.compare(start, width, keywords[k]) == 0) {
            return make_token(line, start, width, source, "keyword");
        }
    }
    return make_token(line, start, 0, source, "keyword");
}
// Returns a "comment" token that runs from a '#' at offset `start` to the end
// of the line. The token text is empty when the character at `start` is not
// '#'.
Token find_comment(int line, const string & source, int start) {
    const string::size_type total = source.size();
    const string::size_type first = start;
    int length = 0;
    if (first < total && source[first] == '#') {
        length = static_cast<int>(total - first);
    }
    return make_token(line, start, length, source, "comment");
}
// Signature shared by the scanners that take a start offset:
// (line number, line text, start offset) -> token, with empty text when
// nothing matches at that offset.
typedef Token (* find_function)(int, const string &, int);

// Runs the scanner `find` over `line` from left to right and records every
// token it produces in `tokens`, keyed by the token's position.
//
// After a match scanning resumes right behind the token; otherwise it moves
// on by one character.
//
// can_conflict tells what to do when a token is already stored at the same
// position:
//   true  - keep the existing token unless the new one is strictly longer;
//   false - throw runtime_error describing both tokens.
void add_tokens(HashTable<Token, Token> & tokens, int lineno, const string & line, find_function find, bool can_conflict) {
    int cursor = 0;
    while (cursor < line.size()) {
        const Token found = find(lineno, line, cursor);
        if (found.text.size() == 0) {
            ++ cursor;
            continue;
        }

        if (! tokens.containsKey(found)) {
            tokens.put(found, found);
        }
        else {
            const Token previous = tokens.get(found);
            if (! can_conflict) {
                stringstream report;
                report << "Conflicting tokens:" << endl;
                report << previous << endl;
                report << found << endl;
                throw runtime_error(report.str());
            }
            if (previous.text.size() < found.text.size()) {
                tokens.put(found, found);
            }
        }
        cursor += found.text.size();
    }
}
void get_tokens(const string & filename, vector<vector<Token> > & tokens) {
vector<string> lines;
read_lines(filename, lines);
HashTable<Token, Token> all_tokens;
for (int i = 0; i < lines.size(); ++ i) {
Token indent = find_indent(i, lines[i]);
if (indent.text.size() > 0)
all_tokens.put(indent, indent);
add_tokens(all_tokens, i, lines[i], find_string, false);
add_tokens(all_tokens, i, lines[i], find_int, false);
add_tokens(all_tokens, i, lines[i], find_comment, false);
add_tokens(all_tokens, i, lines[i], find_keyword, false);
add_tokens(all_tokens, i, lines[i], find_id, true);
add_tokens(all_tokens, i, lines[i], find_blank, true);
}
Token index;
for (int i = 0; i < lines.size(); ++ i) {
tokens.push_back(vector<Token>());
index.line = i;
index.pos = 0;
while (index.pos < lines[i].size()) {
if (! all_tokens.containsKey(index)) {
stringstream ss;
ss << "Lexical error at line #" << (index.line + 1) << " char #" << index.pos;
throw runtime_error(ss.str());
}
Token t = all_tokens.get(index);
if (t.type != "blank")
tokens[i].push_back(t);
index.pos += t.text.size();
}
}
}
int main() {
vector<vector<Token> > tokens;
get_tokens("test.py", tokens);
for (int i = 0; i < tokens.size(); ++ i)
for (int j = 0; j < tokens[i].size(); ++ j)
cout << tokens[i][j] << endl;
}
测试的python:
test.py
1 |
|