// First, a parser for right-associative + expressions: // "1 + 2 + 3" ---> 1 + (2 + 3) // Parse an atom (in this case, an integer literal) and return it, or NULL if // the current token isn't an atom. Ast *parse_atom(void); Ast *parse_expr(void) { Ast *node = parse_atom(); if (!node) { return 0; } if (peek().type == '+') { next(); Ast *right = parse_expr(); node = make_binary_node('+', node, right); } return node; } Ast *parse(void) { Ast *node = parse_expr(); assert(peek().type == TokenEOF); return node; } // Add parentheses: // "1 + 2 + 3" ---> 1 + (2 + 3) // "(1 + 2) + 3" ---> (1 + 2) + 3 Ast *parse_expr(void) { Ast *node = 0; if (peek().type == '(') { next(); node = parse_expr(); assert(node); assert(peek().type == ')'); next(); } else { node = parse_atom(); if (!node) { return 0; } } if (peek().type == '+') { next(); Ast *right = parse_expr(); node = make_binary_node('+', node, right); } return node; } Ast *parse(void) { Ast *node = parse_expr(); assert(peek().type == TokenEOF); return node; } // Now, a parser for left-associative + with parentheses. // "1 + 2 + 3" ---> (1 + 2) + 3 // "1 + ((2) + 3)" ---> 1 + (2 + 3) // To do this, we parse with a loop: // When we see a +, we call parse_expr() recursively, but tell it to only parse // until the next +. Then we keep going as long as the next token is a +. Ast *parse_expr(bool stop_at_plus) { Ast *node = 0; if (peek().type == '(') { next(); node = parse_expr(false); assert(node); assert(peek().type == ')'); next(); } else { node = parse_atom(); if (!node) { return 0; } } while (1) { Token op = peek(); if (op.type == '+' && !stop_at_plus) { next(); Ast *right = parse_expr(true); node = make_binary_node('+', node, right); } else { break; } } return node; } Ast *parse(void) { Ast *node = parse_expr(false); assert(peek().type == TokenEOF); return node; } // Now we add precedence. We give each operator a "stickiness": // "2 + 3 * 4 + 5" ---> (2 + (3 * 4)) + 5 // // * is stickier than +, so let's say * has stickiness 2 and + has stickiness 1. // Say our parser is here after seeing the first +: // 2 +|3 * 4 + 5 // When we call parse_expr recursively, we want it to parse the 3 * 4 // expression -- because * is stickier than + -- but stop at the +, like before. // So we pass a "minimum stickiness" argument, and stop as soon as we see an operator // less sticky than that. This is a more general version of stop_at_plus from before. Ast *parse_expr(int min_stickiness) { Ast *node = 0; if (peek().type == '(') { next(); node = parse_expr(0); assert(node); assert(peek().type == ')'); next(); } else { node = parse_atom(); if (!node) { return 0; } } while (1) { Token op = peek(); if (op.type == '+' && min_stickiness <= 1) { next(); Ast *right = parse_expr(2); node = make_binary_node('+', node, right); } else if (op.type == '*' && min_stickiness <= 2) { next(); Ast *right = parse_expr(3); node = make_binary_node('*', node, right); } else { break; } } return node; } Ast *parse(void) { Ast *node = parse_expr(0); assert(peek().type == TokenEOF); return node; } // This is already all it takes to do precedence parsing. But there's // a bit of code duplication, so let's extract that. // Return an operator's stickiness, or -1 if a token isn't a binary operator. int operator_stickiness(TokenType type); Ast *parse_expr(int min_stickiness) { Ast *node = 0; if (peek().type == '(') { next(); node = parse_expr(0); assert(node); assert(peek().type == ')'); next(); } else { node = parse_atom(); if (!node) { return 0; } } while (1) { Token op = peek(); int op_stickiness = operator_stickiness(op.type); if (op_stickiness < min_stickiness) { break; } next(); Ast *right = parse_expr(op_stickiness + 1); node = make_binary_node(op.type, node, right); } return node; } Ast *parse(void) { Ast *node = parse_expr(0); assert(peek().type == TokenEOF); return node; } // Right-associativity is very similar to left-associativity. // Say you have right-associative exponentiation, // "2 ^ 3 ^ 4" ---> 2 ^ (3 ^ 4) // The only change is whether, when parsing the expression to the right of the // first ^, // "2 ^|3 ^ 4", // We stop when we see another ^, or only at lower stickiness levels. // That is, we adjust the min_stickiness of the recursive call. bool is_left_associative(TokenType type); Ast *parse_expr(int min_stickiness) { Ast *node = 0; if (peek().type == '(') { next(); node = parse_expr(0); assert(node); assert(peek().type == ')'); next(); } else { node = parse_atom(); if (!node) { return 0; } } while (1) { Token op = peek(); int op_stickiness = operator_stickiness(op.type); if (op_stickiness < min_stickiness) { break; } next(); int adjustment = is_left_associative(op.type) ? 1 : 0; Ast *right = parse_expr(op_stickiness + adjustment); node = make_binary_node(op.type, node, right); } return node; } Ast *parse(void) { Ast *node = parse_expr(0); assert(peek().type == TokenEOF); return node; } // That's all it takes to write a precedence parser. // This is much simpler than the typical system that tries to look like a BNF // grammar. This algorithm is sometimes called "precedence climbing"; a variant // of it is called "Pratt parsing". If you use your own stack instead of the // call stack, it's called the "shunting-yard algorithm". It also has other // names. // It's also easy to extend. Exercises: // * Add unary prefix operators (like unary -). // * Add unary postfix operators (like ! for factorial). // * Add mixfix operators, like C's a ? b : c. // Agda-style mixfix operators, driven entirely by a lookup table, are also simple. // Note that you can even view parentheses as operators.