使用 C++ 实现一个简单的 Python 语法分析器

分析两种语法树:

  • 每行的语法树
  • 整个文件的语法树

每行的语法树:

1

( function_def ( def def )( id test_loop )( -LRB- -LRB- )( id n )( -RRB- -RRB- )( : : ) )

1


1

1


1

1


整个文件的语法树:

1

代码:

用到的HashTable:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
//hashtable.cpp
#ifndef CPP_2018_HASH_MAP
#define CPP_2018_HASH_MAP

#include <vector>
#include <string>
using namespace std;

// Hashes `text` into a bucket index for a table with `max` slots.
// Every byte is XOR-folded into an accumulator after being shifted left by
// 0, 8 or 16 bits depending on its position in the string.
// Bytes are read as unsigned char: where plain char is signed, characters
// >= 0x80 would otherwise make the accumulator, and therefore the returned
// index, negative.
// Precondition: max > 0.
int operator % (const string & text, int max) {
    int code = 0;
    for (string::size_type i = 0; i < text.size(); ++ i) {
        const int byte = static_cast<unsigned char>(text[i]);
        code ^= byte << (i * 8 % 24);
    }
    return code % max;
}

// Thrown by HashTable::get when the requested key is not stored.
class NoSuchKeyException {};

// Hash table using open addressing with linear probing.
//
// Requirements on the template arguments, as used by the code below:
//   * K and V are default-constructible and copy-assignable;
//   * `key % int` yields an int hash (hashIndex) and `key == key` compares keys.
// The table doubles when more than half of the slots are used and halves
// when fewer than a quarter are used.
template <typename K, typename V>
class HashTable
{
private:

    // One slot of the probe array; key/value are meaningful only while isInUse.
    class Entry
    {
    public:
        K key;
        V value;
        bool isInUse;

        // Value-initialize key/value so copying an unused slot never reads
        // an indeterminate object.
        Entry() : key(), value(), isInUse(false) {}
    };

    Entry * entries;

    int capacity;
    int count;

    // Allocates an empty table with `capacity2` slots (does not free the old array).
    void initialize(int capacity2) {
        count = 0;
        capacity = capacity2;
        entries = new Entry[capacity];
    }

    // Replaces the members with a deep copy of `map2` (does not free the old array).
    void assign(const HashTable & map2) {
        count = map2.count;
        capacity = map2.capacity;
        entries = new Entry[capacity];
        for (int i = 0; i < capacity; ++ i) {
            entries[i] = map2.entries[i];
        }
    }

public:

    HashTable() {
        initialize(2);
    }

    ~HashTable() {
        delete [] entries;
    }

    HashTable(const HashTable & map2) {
        assign(map2);
    }

    // Copy assignment. The old array is released only after the copy has been
    // made, so `t = t` no longer reads freed memory and a throwing copy leaves
    // the old array allocated.
    HashTable & operator = (const HashTable & map2) {
        if (this != &map2) {
            Entry * old_entries = entries;
            assign(map2);
            delete [] old_entries;
        }
        return (*this);
    }

    // Removes every entry and shrinks back to the initial capacity.
    void clear() {
        delete [] entries;
        initialize(2);
    }

private:

    // Home slot of `key`. The remainder is normalized into [0, capacity)
    // because `%` may yield a negative value for a negative hash.
    int hashIndex(const K & key) const {
        int index = key % capacity;
        if (index < 0) index += capacity;
        return index;
    }

    // Returns the slot holding `key`, or the first free slot on its probe
    // sequence when the key is absent. Terminates because put() keeps at
    // least half of the slots free.
    int find(const K & key) const {
        int index = hashIndex(key);
        while (true) {
            if (! entries[index].isInUse) {
                return index;
            }
            if (entries[index].key == key) {
                return index;
            }
            index = (index + 1) % capacity;
        }
    }

    // Rebuilds the table with `capacity2` slots by re-inserting every entry.
    void resize(int capacity2) {
        Entry * entries0 = entries;
        int capacity0 = capacity;
        initialize(capacity2);
        for (int i = 0; i < capacity0; ++ i) {
            if (entries0[i].isInUse) {
                put(entries0[i].key, entries0[i].value);
            }
        }
        delete [] entries0;
    }

public:

    // Inserts `key` or overwrites the value already stored under it.
    void put(const K & key, const V & value) {
        int index = find(key);
        entries[index].value = value;
        if (entries[index].isInUse) return;

        entries[index].isInUse = true;
        entries[index].key = key;

        ++ count;
        if (count > capacity / 2) {
            resize(capacity * 2);
        }
    }

    // Returns a copy of the value stored under `key`.
    // Throws NoSuchKeyException when the key is absent.
    V get(const K & key) const {
        int index = find(key);
        if (! entries[index].isInUse) {
            throw NoSuchKeyException();
        }
        return entries[index].value;
    }

    // Removes `key`; returns false when it was not stored.
    bool remove(const K & key) {
        int index = find(key);
        if (! entries[index].isInUse) return false;
        fillNotInUseEntry(index);

        -- count;
        if (count < capacity / 4) {
            resize(capacity / 2);
        }
        return true;
    }

private:

    // Closes the hole left at `index`: walks the following run of used slots
    // and moves back every entry whose home slot is not cyclically inside
    // (hole, candidate], so later lookups still reach it by probing.
    void fillNotInUseEntry(int index) {
        int next = index;
        while (true) {
            next = (next + 1) % capacity;
            if (! entries[next].isInUse) {
                entries[index].isInUse = false;
                return;
            }
            int index0 = hashIndex(entries[next].key);
            if (index < next) {
                if (index0 > index &&
                    index0 <= next) continue;
            }
            else {
                // The range (index, next] wraps around the end of the array.
                if (index0 > index ||
                    index0 <= next) continue;
            }
            entries[index] = entries[next];
            index = next;
        }
    }

public:

    bool containsKey(const K & key) const {
        int index = find(key);
        return (entries[index].isInUse);
    }

    // Number of stored entries.
    int size() const {
        return count;
    }

    // Returns all stored keys in slot order.
    vector<K> getKeys() const {
        vector<K> vec;
        for (int i = 0; i < capacity; ++ i) {
            if (entries[i].isInUse) {
                vec.push_back(entries[i].key);
            }
        }
        return vec;
    }

};

#endif

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
//optim.cpp

#ifndef CPP_2018_OPTIM
#define CPP_2018_OPTIM

#include <iostream>
#include <string>
#include <vector>
#include <sstream>
#include <stdexcept>
using namespace std;

#include "hashtable.cpp"
#include "token.cpp"

// Simplifies one expression node in place:
//   * a node with exactly one child is replaced by that child;
//   * a three-child node of the form "(" X ")" is replaced by X;
// afterwards any type starting with "expression" is normalized to "expression".
void optim_exp(Node & node) {
    const bool single_child = (node.children.size() == 1);
    const bool parenthesized = (node.children.size() == 3
        && node.children[0].type == "("
        && node.children[2].type == ")");
    if (single_child || parenthesized) {
        // Copy first: the replacement is stored inside `node` itself.
        Node replacement = node.children[single_child ? 0 : 1];
        node = replacement;
    }
    if (startswith(node.type, "expression"))
        node.type = "expression";
}

// Simplifies a syntax tree bottom-up: children are optimized first, then
// expression nodes are collapsed by optim_exp and "operator_*" wrapper nodes
// are replaced by their first child.
void optim(Node & node) {
    for (size_t i = 0; i < node.children.size(); ++ i)
        optim(node.children[i]);
    if (startswith(node.type, "expression"))
        optim_exp(node);
    if (startswith(node.type, "operator_") && ! node.children.empty()) {
        // Take a copy before assigning: the child lives inside node.children,
        // which the assignment overwrites while it is still being read.
        Node only_child = node.children[0];
        node = only_child;
    }
}

// Applies the node-level optimization to this line's syntax tree and then,
// recursively, to every nested line.
void optim(Line & line) {
    optim(line.node);
    typedef vector<Line>::iterator LineIter;
    for (LineIter child = line.children.begin(); child != line.children.end(); ++ child)
        optim(*child);
}

#endif


// #include <fstream>
// using namespace std;

// int main() {
// string filename = "test.py";
// stringstream ss;
// ss << "Cannot open file: " << filename;
// ifstream in(filename.c_str());
// if (in.fail()) throw runtime_error(ss.str());
// vector<string> lines;
// read_lines(in, lines);
// in.close();

// Line tree;
// get_tree(lines, tree);
// optim(tree);
// print_nodes(cout, tree);
// cout << tree << endl;
// }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
//token.cpp

#ifndef CPP_2018_TOKEN
#define CPP_2018_TOKEN

#include <iostream>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
using namespace std;

#include <cstring>
#include "hashtable.cpp"

// A lexical token. Tokens are identified by their (line, pos) pair: the
// hashing and equality operators look at nothing else.
struct Token
{
    int line;     // 0-based index of the source line (diagnostics print line + 1)
    int pos;      // 0-based character offset of the token within its line
    string type;  // rule name ("indent", "blank", "str", "int", "id", "comment") or the keyword text
    string text;  // matched source text; empty means "no match"

    // Start from a well-defined position so default-constructed tokens can be
    // copied and compared without reading indeterminate values.
    Token() : line(0), pos(0) {}
};

// Hash of a token for HashTable: derived only from its (line, pos) position.
int operator % (const Token & t, int base) {
    const int code = t.line * 12347 + t.pos;
    return code % base;
}

// Two tokens are equal when they start at the same line and offset;
// type and text are not compared.
bool operator == (const Token & t1, const Token & t2) {
    if (t1.line != t2.line) return false;
    return t1.pos == t2.pos;
}

// Writes a token as: line <TAB> pos <TAB> type <TAB> "text".
ostream & operator << (ostream & out, const Token & t) {
    out << t.line << '\t';
    out << t.pos << '\t';
    out << t.type << "\t\"";
    out << t.text << "\"";
    return out;
}

// Appends every line readable from `in` to `lines` (line terminators are
// dropped by getline).
void read_lines(istream & in, vector<string> & lines) {
    for (string current; getline(in, current); )
        lines.push_back(current);
}

// Builds a token of the given type whose text is source[pos, pos + len).
Token make_token(int line, int pos, int len, const string & source, const string & type) {
    Token result;
    result.text = source.substr(pos, len);
    result.type = type;
    result.pos = pos;
    result.line = line;
    return result;
}

////////////////////
// RULES BEGIN

// Returns an "indent" token covering the run of leading tab characters of
// `source`; the token text is empty when the line does not start with a tab.
Token find_indent(int line, const string & source) {
    int tabs = 0;
    while (tabs < source.size() && source[tabs] == '\t')
        ++ tabs;
    return make_token(line, 0, tabs, source, "indent");
}

// Returns a "blank" token covering the run of tabs and spaces that begins at
// `start`; the token text is empty when there is no whitespace there.
Token find_blank(int line, const string & source, int start) {
    int end = start;
    while (end < source.size() && (source[end] == '\t' || source[end] == ' '))
        ++ end;
    return make_token(line, start, end - start, source, "blank");
}

// Returns a "str" token for a double-quoted literal starting at `start`.
// The token text is empty when source[start] is not '"'. Inside the literal
// a backslash may only be followed by one of \ r n t " - any other character
// stops the scan right there. The scan also stops at the closing quote
// (included) or at the end of the line.
Token find_string(int line, const string & source, int start) {
    const bool opens_here = start < source.size() && source[start] == '"';
    if (! opens_here)
        return make_token(line, start, 0, source, "str");

    int len = 1; // the opening quote
    bool escaped = false;
    for (int i = start + 1; i < source.size(); ++ i) {
        const char c = source[i];
        if (escaped) {
            const bool known_escape =
                (c == '\\' || c == 'r' || c == 'n' || c == 't' || c == '"');
            if (! known_escape) break;
            ++ len;
            escaped = false;
        }
        else {
            ++ len;
            if (c == '"') break; // closing quote is part of the token
            escaped = (c == '\\');
        }
    }
    return make_token(line, start, len, source, "str");
}

// Returns an "int" token for a decimal literal starting at `start`:
// either a lone "0" (not followed by another digit) or a run of digits whose
// first digit is 1-9. The token text is empty when neither form matches.
Token find_int(int line, const string & source, int start) {
    if (start < source.size() && source[start] == '0') {
        const bool followed_by_digit = start + 1 != source.size()
            && source[start + 1] >= '0' && source[start + 1] <= '9';
        if (! followed_by_digit)
            return make_token(line, start, 1, source, "int");
    }
    int len = 0;
    for (int i = start; i < source.size(); ++ i) {
        const char c = source[i];
        // A leading zero is not allowed, so the first digit starts at '1'.
        const char lowest = (i == start) ? '1' : '0';
        if (c < lowest || c > '9') break;
        ++ len;
    }
    return make_token(line, start, len, source, "int");
}

// Returns an "id" token for an identifier starting at `start`: ASCII letters,
// '_' and, except in the first position, digits. The token text is empty when
// no identifier starts there.
Token find_id(int line, const string & source, int start) {
    int len = 0;
    for (int i = start; i < source.size(); ++ i) {
        const char c = source[i];
        const bool is_digit = (c >= '0' && c <= '9');
        const bool is_word_char =
            (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_';
        const bool accepted = is_word_char || (is_digit && i != start);
        if (! accepted) break;
        ++ len;
    }
    return make_token(line, start, len, source, "id");
}

// Fixed spellings recognized by find_keyword; the token type of a match is
// the spelling itself. The list is searched in order and the first match wins.
// "," is included because the function_def and function_call parser rules
// match nodes of type ","; without it any argument list with more than one
// element fails with a lexical error.
const char *keywords[] = {
    "def", "if", "else", "for", "in", "return", "yield", "pass",
    "True", "False", "None",
    "(", ")", ":", ",",
    "=", "+",
};

// Returns a token for the first entry of `keywords` that occurs at `start`;
// its type is the keyword spelling. When nothing matches, the token has an
// empty type and empty text.
Token find_keyword(int line, const string & source, int start) {
    const int count = sizeof(keywords) / sizeof(const char *);
    for (int k = 0; k < count; ++ k) {
        const char * word = keywords[k];
        const size_t word_len = strlen(word);
        if (source.compare(start, word_len, word) == 0)
            return make_token(line, start, word_len, source, word);
    }
    return make_token(line, start, 0, source, "");
}

// Returns a "comment" token running from `start` to the end of the line when
// source[start] is '#'; otherwise the token text is empty.
Token find_comment(int line, const string & source, int start) {
    int len = 0;
    if (start < source.size() && source[start] == '#')
        len = source.size() - start;
    return make_token(line, start, len, source, "comment");
}

////////////////////
// RULES END

// Signature shared by the lexical rules that scan from an offset:
// (line number, source line, start offset) -> Token. add_tokens treats a
// returned token with empty text as "no match at this offset".
typedef Token (* find_function)(int, const string &, int);

// Runs one lexical rule over a whole line and records its matches in
// `tokens`, keyed by position. After a match the scan resumes behind the
// matched text, otherwise one character further.
// When a token is already stored at the same position:
//   * can_conflict == true  - the longer text wins (ties keep the old token);
//   * can_conflict == false - a runtime_error describing both tokens is thrown.
void add_tokens(HashTable<Token, Token> & tokens, int lineno, const string & line, find_function find, bool can_conflict) {
    int cursor = 0;
    while (cursor < line.size()) {
        Token found = find(lineno, line, cursor);
        if (found.text.size() == 0) {
            ++ cursor;
            continue;
        }
        const bool occupied = tokens.containsKey(found);
        if (occupied && ! can_conflict) {
            Token previous = tokens.get(found);
            stringstream ss;
            ss << "Conflicting tokens:" << endl;
            ss << previous << endl;
            ss << found << endl;
            throw runtime_error(ss.str());
        }
        if (! occupied || tokens.get(found).text.size() < found.text.size())
            tokens.put(found, found);
        cursor += found.text.size();
    }
}

void get_tokens(vector<string> & lines, vector<vector<Token> > & tokens) {
HashTable<Token, Token> all_tokens;
for (int i = 0; i < lines.size(); ++ i) {
Token indent = find_indent(i, lines[i]);
if (indent.text.size() > 0)
all_tokens.put(indent, indent);
add_tokens(all_tokens, i, lines[i], find_string, false);
add_tokens(all_tokens, i, lines[i], find_int, false);
add_tokens(all_tokens, i, lines[i], find_comment, false);
add_tokens(all_tokens, i, lines[i], find_keyword, false);
add_tokens(all_tokens, i, lines[i], find_id, true);
add_tokens(all_tokens, i, lines[i], find_blank, true);
// TODO: add more lexical matching functions
}
Token index;
for (int i = 0; i < lines.size(); ++ i) {
tokens.push_back(vector<Token>());
index.line = i;
index.pos = 0;
while (index.pos < lines[i].size()) {
if (! all_tokens.containsKey(index)) {
stringstream ss;
ss << "Lexical error at line #" << (index.line + 1) << " char #" << index.pos;
throw runtime_error(ss.str());
}
Token t = all_tokens.get(index);
if (t.type != "blank")
tokens[i].push_back(t);
index.pos += t.text.size();
}
}
}

#endif


// #include <fstream>
// using namespace std;

// int main() {
// string filename = "test.py";
// stringstream ss;
// ss << "Cannot open file: " << filename;
// ifstream in(filename.c_str());
// if (in.fail()) throw runtime_error(ss.str());
// vector<string> lines;
// read_lines(in, lines);
// in.close();

// vector<vector<Token> > tokens;
// get_tokens(lines, tokens);
// for (int i = 0; i < tokens.size(); ++ i)
// for (int j = 0; j < tokens[i].size(); ++ j)
// cout << tokens[i][j] << endl;
// }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
//tree.cpp

#ifndef CPP_2018_TREE
#define CPP_2018_TREE

#include <vector>
#include <sstream>
#include <stdexcept>
using namespace std;

#include "token.cpp"
#include "hashtable.cpp"

struct Node
{
int index;
int span;
string type;
vector<Node> children; // for non-terminal
Token token; // for terminal, as "indent" for non-terminal
};

// Nodes are considered equal when they cover the same leaves (index, span)
// and have the same type; children and token are not compared.
bool operator == (const Node & n1, const Node & n2) {
    if (n1.index != n2.index) return false;
    if (n1.span != n2.span) return false;
    return n1.type == n2.type;
}

// Makes a label safe for the bracketed tree output: spaces become '_' and a
// label that is exactly "(" or ")" becomes "-LRB-" / "-RRB-".
string to_tree_text(string text) {
    for (string::iterator it = text.begin(); it != text.end(); ++ it) {
        if (*it == ' ') *it = '_';
    }
    if (text == "(") return "-LRB-";
    if (text == ")") return "-RRB-";
    return text;
}

void print_node(ostream & out, const Node & node, int indent=0) {
// for (int i = 0; i < indent; ++ i) out << '\t';
if (node.children.size() == 0) {
out << "( " << to_tree_text(node.type) << ' ' << to_tree_text(node.token.text) << " )";
}
else {
// out << node.type << "\tindex=" << node.index << "\tspan=" << node.span << endl;
out << "( " << to_tree_text(node.type) << " ";
for (int i = 0; i < node.children.size(); ++ i) {
print_node(out, node.children[i], indent + 1);
}
out << " )";
}
}

// Streams a node using the bracketed form produced by print_node.
ostream & operator << (ostream & out, const Node & node) {
    print_node(out, node);
    return out;
}

////////////////////
// RULES BEGIN

void make_node(const vector<Node> & node_list, vector<Node> & matched, const string & type, int len) {
if (node_list.size() != len) return;
Node node;
node.type = type;
for (int i = 0; i < len; ++ i)
node.children.push_back(node_list[i]);
matched.push_back(node);
}

// Rule: def id ( [id {, id}] ) :   ->   function_def
void match_function_def(const vector<Node> & node_list, vector<Node> & matched) {
    const int size = node_list.size();
    if (size < 5) return;
    if (node_list[0].type != "def"
        || node_list[1].type != "id"
        || node_list[2].type != "(") return;

    int pos = 3;
    if (node_list[pos].type == "id") {
        ++ pos;
        // Every further parameter is a "," followed by an id.
        while (pos < size && node_list[pos].type == ",") {
            ++ pos;
            if (pos >= size || node_list[pos].type != "id") return;
            ++ pos;
        }
    }
    if (pos >= size || node_list[pos].type != ")") return;
    ++ pos;
    if (pos >= size || node_list[pos].type != ":") return;
    ++ pos;
    make_node(node_list, matched, "function_def", pos);
}

// Expression rules:
//   atom                                   -> expression1   (term)
//   expression1                            -> expression    (factor)
//   ( expression )                         -> expression1   (term)
//   (expression1|expressionL) operator_add expression1
//                                          -> expression and expressionL
void match_expression(const vector<Node> & node_list, vector<Node> & matched) {
    if (node_list.size() == 1) {
        static const char * const atoms[] = {
            "function_call", "str", "id", "int", "None", "True", "False"
        };
        const string & type = node_list[0].type;
        bool is_atom = false;
        for (int i = 0; i < sizeof(atoms) / sizeof(atoms[0]); ++ i) {
            if (type == atoms[i]) {
                is_atom = true;
                break;
            }
        }
        if (is_atom)
            make_node(node_list, matched, "expression1", 1); // term
        else if (type == "expression1")
            make_node(node_list, matched, "expression", 1); // factor
        return;
    }
    if (node_list.size() != 3) return;

    const string & first = node_list[0].type;
    const string & middle = node_list[1].type;
    const string & last = node_list[2].type;
    if (first == "(" && middle == "expression" && last == ")") {
        make_node(node_list, matched, "expression1", 3); // term
        return;
    }
    const bool left_operand = (first == "expression1" || first == "expressionL");
    if (left_operand && middle == "operator_add" && last == "expression1") {
        make_node(node_list, matched, "expression", 3); // factor
        make_node(node_list, matched, "expressionL", 3);
    }
}

// Rule: id ( [expression {, expression}] )   ->   function_call
// `start` is unused; it is kept so existing callers keep compiling.
void match_function_call(const vector<Node> & node_list, vector<Node> & matched, int start=0) {
    // The shortest call is "id ( )", i.e. three nodes. The previous bound of
    // four rejected calls without arguments although the code below handles
    // an empty argument list.
    if (node_list.size() < 3) return;
    if (node_list[0].type != "id") return;
    if (node_list[1].type != "(") return;
    int index = 2;
    while (node_list[index].type == "expression") {
        ++ index;
        if (node_list.size() > index && node_list[index].type == ",") {
            ++ index;
            // A "," must be followed by another argument.
            if (node_list.size() <= index || node_list[index].type != "expression") return;
        }
        else break;
    }
    if (node_list.size() <= index || node_list[index].type != ")") return;
    ++ index;
    make_node(node_list, matched, "function_call", index);
}

// Rule: for id in expression :   ->   for_statement
// `start` is unused.
void match_for(const vector<Node> & node_list, vector<Node> & matched, int start=0) {
    static const char * const pattern[] = { "for", "id", "in", "expression", ":" };
    if (node_list.size() != 5) return;
    for (int i = 0; i < 5; ++ i) {
        if (node_list[i].type != pattern[i]) return;
    }
    make_node(node_list, matched, "for_statement", 5);
}

// Rule: if expression :   ->   if_statement
void match_if(const vector<Node> & node_list, vector<Node> & matched) {
    const bool fits = node_list.size() == 3
        && node_list[0].type == "if"
        && node_list[1].type == "expression"
        && node_list[2].type == ":";
    if (fits)
        make_node(node_list, matched, "if_statement", 3);
}

// Rule: else :   ->   else_statement
void match_else(const vector<Node> & node_list, vector<Node> & matched) {
    const bool fits = node_list.size() == 2
        && node_list[0].type == "else"
        && node_list[1].type == ":";
    if (fits)
        make_node(node_list, matched, "else_statement", 2);
}

// Rule: +   ->   operator_add
void match_operator(const vector<Node> & node_list, vector<Node> & matched) {
    if (node_list.size() == 1 && node_list[0].type == "+")
        make_node(node_list, matched, "operator_add", 1);
}

// Rule: id = expression   ->   assign_statement
void match_assign(const vector<Node> & node_list, vector<Node> & matched) {
    const bool fits = node_list.size() == 3
        && node_list[0].type == "id"
        && node_list[1].type == "="
        && node_list[2].type == "expression";
    if (fits)
        make_node(node_list, matched, "assign_statement", 3);
}

// Rule: yield expression   ->   yield_statement
void match_yield(const vector<Node> & node_list, vector<Node> & matched) {
    const bool fits = node_list.size() == 2
        && node_list[0].type == "yield"
        && node_list[1].type == "expression";
    if (fits)
        make_node(node_list, matched, "yield_statement", 2);
}

// Rules: return              ->   return_statement
//        return expression   ->   return_statement
void match_return(const vector<Node> & node_list, vector<Node> & matched) {
    const int size = node_list.size();
    if (size != 1 && size != 2) return;
    if (node_list[0].type != "return") return;
    if (size == 2 && node_list[1].type != "expression") return;
    make_node(node_list, matched, "return_statement", size);
}

// Rule: pass   ->   pass_statement
void match_pass(const vector<Node> & node_list, vector<Node> & matched) {
    if (node_list.size() != 1 || node_list[0].type != "pass") return;
    make_node(node_list, matched, "pass_statement", 1);
}

////////////////////
// RULES END

vector<Node> check_rules(vector<Node> & node_list) {
vector<Node> matched;
match_function_def(node_list, matched);
match_expression(node_list, matched);
match_function_call(node_list, matched);
match_for(node_list, matched);
match_if(node_list, matched);
match_else(node_list, matched);
match_assign(node_list, matched);
match_yield(node_list, matched);
match_return(node_list, matched);
match_operator(node_list, matched);
match_pass(node_list, matched);
return matched;
}

// Recursive step of the per-line parser.
//   nodes     - nodes[p] holds the nodes found so far that start at leaf position p
//   node_list - the sequence of adjacent nodes currently being tried (non-empty)
//   index     - leaf position at which the last node of node_list starts
//   to_check  - work list; every node newly produced by a rule is appended to it
// The current sequence is offered to all rules, then extended in turn with
// each known node that starts right behind it, recursing for every extension.
// Throws runtime_error when the sequence would run past the last leaf or when
// a rule produces a node equal to one already waiting in to_check.
void check_sequences(vector<vector<Node> > & nodes, vector<Node> & node_list,
int index, vector<Node> & to_check)
{
Node last_node = node_list[node_list.size() - 1];
// Leaf position just behind the current sequence.
int next_index = index + last_node.span;
// cout << next_index << ' ' << nodes.size() << endl;
if (next_index > nodes.size()) {
throw runtime_error("Internal error: incorrect next_index");
}
vector<Node> matched = check_rules(node_list);
for (int i = 0; i < matched.size(); ++ i) {
// A produced node starts where its first child starts and covers the sum
// of its children's spans.
matched[i].index = matched[i].children[0].index;
matched[i].span = 0;
for (int j = 0; j < matched[i].children.size(); ++ j)
matched[i].span += matched[i].children[j].span;
// The same (index, span, type) must not be queued twice.
for (int j = 0; j < to_check.size(); ++ j) {
if (to_check[j] == matched[i]) {
stringstream ss;
ss << "Internal error: repeated nodes:" << endl;
ss << to_check[j] << endl;
ss << matched[i] << endl;
throw runtime_error(ss.str());
}
}
to_check.push_back(matched[i]);
}
// for (int i = 0; i < matched.size(); ++ i) {
// cout << matched[i] << endl;
// }
// The sequence already reaches the end of the line: nothing to append.
if (next_index == nodes.size()) return;
for (int i = 0; i < nodes[next_index].size(); ++ i) {
// Work on a copy of the candidate; node_list is restored after the recursive call.
Node node = nodes[next_index][i];
node_list.push_back(node);
check_sequences(nodes, node_list, next_index, to_check);
node_list.pop_back();
}
}

// True when `text` begins with `prefix` (an empty prefix always matches).
bool startswith(const string & text, const string & prefix) {
    if (prefix.size() > text.size()) return false;
    return text.compare(0, prefix.size(), prefix) == 0;
}

// True when `text` ends with `postfix` (an empty postfix always matches).
bool endswith(const string & text, const string & postfix) {
    if (postfix.size() > text.size()) return false;
    const string::size_type offset = text.size() - postfix.size();
    return text.compare(offset, postfix.size(), postfix) == 0;
}

// Parses the tokens of one source line into a single syntax tree.
// Every non-indent token becomes a leaf node. Rules are then applied
// repeatedly through a work list until no new node appears, and a node that
// starts at the first leaf and spans all leaves is returned.
//   lineno - 0-based line index, used for the error message only
// Throws runtime_error("Syntax error in line: N") when no node covers the
// whole line, including the case of a line without any non-indent token.
Node parse_line(int lineno, vector<Token> & tokens) {
    vector<vector<Node> > nodes;
    vector<Node> to_check;
    int span = tokens.size();
    for (int i = 0; i < tokens.size(); ++ i) {
        if (tokens[i].type == "indent") {
            -- span;
            continue;
        }
        Node node;
        node.type = tokens[i].type;
        node.index = nodes.size();
        node.span = 1;
        node.token = tokens[i];
        nodes.push_back(vector<Node>());
        to_check.push_back(node);
    }
    while (to_check.size() > 0) {
        Node first = to_check[to_check.size() - 1];
        // cout << "--->" << '\t' << first << endl;
        to_check.pop_back();
        vector<Node> node_list;
        node_list.push_back(first);
        nodes[first.index].push_back(first);
        check_sequences(nodes, node_list, first.index, to_check);
    }
    int result = -1;
    // With no leaves at all there is no nodes[0] to inspect; fall through to
    // the syntax error instead of indexing an empty vector.
    if (! nodes.empty()) {
        for (int i = 0; i < nodes[0].size(); ++ i) {
            if (nodes[0][i].span == span)
                result = i;
        }
    }
    if (result == -1) {
        stringstream ss;
        ss << "Syntax error in line: " << (lineno + 1);
        throw runtime_error(ss.str());
    }
    return nodes[0][result];
}

// One source line together with the lines nested beneath it by indentation.
struct Line
{
int level; // nesting depth: get_tree gives the root 0 and every child parent->level + 1
Node node; // syntax tree of this line (the root uses the type "__main__")
vector<Line> children; // lines attached under this one by get_tree
};

void print_line(ostream & out, const Line & line, int indent=0) {
// for (int i = 0; i < indent; ++ i) out << '\t';
if (line.children.size() == 0) {
// out << line.node.type << '\t' << line.node.token.text << endl;
out << "( " << line.node.type << " _ )";
}
else {
out << "( " << line.node.type << " ";
for (int i = 0; i < line.children.size(); ++ i) {
print_line(out, line.children[i], indent + 1);
}
out << " )";
}
}

// Streams a line hierarchy using the bracketed form produced by print_line.
ostream & operator << (ostream & out, const Line & line) {
    print_line(out, line);
    return out;
}

// Throws a runtime_error reporting inconsistent indentation at the line of
// the given indent token. Token::line is 0-based, so it is reported as
// line + 1, matching the lexical and syntax error messages.
void indent_error(const Token & indent) {
    stringstream ss;
    ss << "Inconsistent indentation at line: " << (indent.line + 1);
    throw runtime_error(ss.str());
}

// Builds the whole-file tree.
// Every line that has at least one non-indent token is parsed into a Node;
// the lines are then nested under each other by comparing their leading
// indent text. `tree` becomes the root with type "__main__".
// Throws runtime_error on lexical, syntax or indentation errors.
void get_tree(vector<string> & lines, Line & tree) {
    vector<vector<Token> > tokens;
    vector<Node> nodes;
    vector<Token> indents;
    get_tokens(lines, tokens);
    for (int i = 0; i < tokens.size(); ++ i) {
        // Skip empty lines and lines that consist of indentation only:
        // they carry no statement and parse_line cannot handle them.
        bool has_content = false;
        for (int j = 0; j < tokens[i].size(); ++ j) {
            if (tokens[i][j].type != "indent") {
                has_content = true;
                break;
            }
        }
        if (! has_content) continue;

        if (tokens[i][0].type == "indent")
            indents.push_back(tokens[i][0]);
        else
            indents.push_back(Token()); // empty text = no indentation
        nodes.push_back(parse_line(i, tokens[i]));
    }
    tree.level = 0;
    tree.node.type = "__main__";
    tree.node.token.text = "__main__";
    tree.node.index = 0;
    tree.node.span = 0;
    // stack[k] is the line currently open at nesting depth k and
    // indent_stack[k] the indent text of the lines directly inside it.
    vector<Line *> stack;
    vector<string> indent_stack;
    stack.push_back(&tree);
    indent_stack.push_back("");
    for (int i = 0; i < nodes.size(); ++ i) {
        string last_indent = indent_stack[indent_stack.size() - 1];
        Line next;
        next.node = nodes[i];
        Line * parent = stack[stack.size() - 1];
        if (indents[i].text == last_indent) {
            // Same depth as the previous line.
            next.level = parent->level + 1;
            parent->children.push_back(next);
        }
        else {
            if (startswith(indents[i].text, last_indent)) {
                // Deeper: the previous line becomes the parent.
                indent_stack.push_back(indents[i].text);
                if (parent->children.size() == 0) indent_error(indents[i]);
                parent = &parent->children[parent->children.size() - 1];
                stack.push_back(parent);
                next.level = parent->level + 1;
                parent->children.push_back(next);
            }
            else {
                // Shallower: close open blocks until the indent matches.
                while (true) {
                    if (indents[i].text == last_indent) {
                        next.level = parent->level + 1;
                        parent->children.push_back(next);
                        break;
                    }
                    if (startswith(last_indent, indents[i].text)) {
                        indent_stack.pop_back();
                        last_indent = indent_stack[indent_stack.size() - 1];
                        stack.pop_back();
                        if (stack.size() == 0) indent_error(indents[i]);
                        parent = stack[stack.size() - 1];
                    }
                    else {
                        indent_error(indents[i]);
                    }
                }
            }
        }
    }
}

void print_nodes(ostream & out, const Line & line) {
if (line.node.type != "__main__")
out << line.node << endl;
if (line.children.size() == 0) return;
for (int i = 0; i < line.children.size(); ++ i) {
print_nodes(out, line.children[i]);
}
}

#endif


#include <fstream>
using namespace std;

#include "optim.cpp"

// Throws runtime_error(err_msg) unless `test` holds.
void assert_true(bool test, const string & err_msg) {
    if (test) return;
    throw runtime_error(err_msg);
}

// Appends all lines of the file `filename` to `lines`.
// Throws runtime_error("Cannot open file: <filename>") when opening fails.
void read_file(string filename, vector<string> & lines) {
    ifstream in(filename.c_str());
    if (in.fail()) {
        stringstream ss;
        ss << "Cannot open file: " << filename;
        throw runtime_error(ss.str());
    }
    read_lines(in, lines);
    in.close();
}

// Entry point: parses a Python source file and prints first the syntax tree
// of every line, then the tree of the whole file.
// The file name is taken from the first command-line argument; without an
// argument the previous hard-coded default "test1.py" is used.
// Errors reported as runtime_error are printed to standard output.
int main(int argc, char * argv[]) {
    try {
        const string filename = (argc >= 2) ? argv[1] : "test1.py";
        vector<string> lines;
        read_file(filename, lines);
        Line tree;
        get_tree(lines, tree);
        optim(tree);
        print_nodes(cout, tree);
        cout << tree << endl;
    }
    catch (const runtime_error & ex) {
        cout << ex.what() << endl;
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#test1.py

def test_loop(n):
for i in range(n):
print(1 + i)

def test_condition(c):
if c:
print("yes")
else:
print("no")

test_loop(10)
test_condition(True)

1

Donate? comment?