# Path: blob/main/third_party/ply/example/GardenSnake/GardenSnake.py
# GardenSnake - a parser generator demonstration program
#
# This implements a modified version of a subset of Python:
#  - only 'def', 'return' and 'if' statements
#  - 'if' only has 'then' clause (no elif nor else)
#  - single-quoted strings only, content in raw format
#  - numbers are decimal.Decimal instances (not integers or floats)
#  - no print statement; use the built-in 'print' function
#  - only < > == + - / * implemented (and unary + -)
#  - assignment and tuple assignment work
#  - no generators of any sort
#  - no ... well, no quite a lot

# Why? I'm thinking about a new indentation-based configuration
# language for a project and wanted to figure out how to do it.  Once
# I got that working I needed a way to test it out.  My original AST
# was dumb so I decided to target Python's AST and compile it into
# Python code.  Plus, it's pretty cool that it only took a day or so
# from sitting down with Ply to having working code.

# This uses David Beazley's Ply from http://www.dabeaz.com/ply/

# This work is hereby released into the Public Domain.  To view a copy of
# the public domain dedication, visit
# http://creativecommons.org/licenses/publicdomain/ or send a letter to
# Creative Commons, 543 Howard Street, 5th Floor, San Francisco,
# California, 94105, USA.
#
# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
#          Andrew Dalke / Dalke Scientific Software, LLC
#          30 August 2006 / Cape Town, South Africa

# Changelog:
#  30 August - added link to CC license; removed the "swapcase" encoding

# Modifications for inclusion in PLY distribution
import sys
sys.path.insert(0, "../..")
from ply import *

##### Lexer ######
#import lex
import decimal

# Every token name the lexer can produce; PLY reads this module-level
# tuple.  INDENT/DEDENT/ENDMARKER are never produced by the raw lexer --
# they are synthesized by the post-processing filters further below.
tokens = (
    'DEF',
    'IF',
    'NAME',
    'NUMBER',  # Python decimals
    'STRING',  # single quoted strings only; syntax of raw strings
    'LPAR',
    'RPAR',
    'COLON',
    'EQ',
    'ASSIGN',
    'LT',
    'GT',
    'PLUS',
    'MINUS',
    'MULT',
    'DIV',
    'RETURN',
    'WS',
    'NEWLINE',
    'COMMA',
    'SEMICOLON',
    'INDENT',
    'DEDENT',
    'ENDMARKER',
)


#t_NUMBER = r'\d+'
# Pattern taken from decimal.py, but without the leading sign.
def t_NUMBER(t):
    r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
    # Numbers become decimal.Decimal instances, not int/float.
    t.value = decimal.Decimal(t.value)
    return t


def t_STRING(t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    # Strip the surrounding quotes and decode backslash escapes
    # (Python 2 str method -- this file targets Python 2).
    t.value = t.value[1:-1].decode("string-escape")
    return t


# Simple single-pattern tokens.
t_COLON = r':'
t_EQ = r'=='
t_ASSIGN = r'='
t_LT = r'<'
t_GT = r'>'
t_PLUS = r'\+'
t_MINUS = r'-'
t_MULT = r'\*'
t_DIV = r'/'
t_COMMA = r','
t_SEMICOLON = r';'

# Ply nicely documented how to do this: keywords are matched as NAME
# and then re-typed via a lookup table.

RESERVED = {
    "def": "DEF",
    "if": "IF",
    "return": "RETURN",
}


def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    t.type = RESERVED.get(t.value, "NAME")
    return t
Needed for "if 1: #comment"113def t_comment(t):114r"[ ]*\043[^\n]*" # \043 is '#'115pass116117118# Whitespace119def t_WS(t):120r' [ ]+ '121if t.lexer.at_line_start and t.lexer.paren_count == 0:122return t123124# Don't generate newline tokens when inside of parenthesis, eg125# a = (1,126# 2, 3)127def t_newline(t):128r'\n+'129t.lexer.lineno += len(t.value)130t.type = "NEWLINE"131if t.lexer.paren_count == 0:132return t133134def t_LPAR(t):135r'\('136t.lexer.paren_count += 1137return t138139def t_RPAR(t):140r'\)'141# check for underflow? should be the job of the parser142t.lexer.paren_count -= 1143return t144145146def t_error(t):147raise SyntaxError("Unknown symbol %r" % (t.value[0],))148print "Skipping", repr(t.value[0])149t.lexer.skip(1)150151## I implemented INDENT / DEDENT generation as a post-processing filter152153# The original lex token stream contains WS and NEWLINE characters.154# WS will only occur before any other tokens on a line.155156# I have three filters. One tags tokens by adding two attributes.157# "must_indent" is True if the token must be indented from the158# previous code. The other is "at_line_start" which is True for WS159# and the first non-WS/non-NEWLINE on a line. 
It flags the check so160# see if the new line has changed indication level.161162# Python's syntax has three INDENT states163# 0) no colon hence no need to indent164# 1) "if 1: go()" - simple statements have a COLON but no need for an indent165# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent166NO_INDENT = 0167MAY_INDENT = 1168MUST_INDENT = 2169170# only care about whitespace at the start of a line171def track_tokens_filter(lexer, tokens):172lexer.at_line_start = at_line_start = True173indent = NO_INDENT174saw_colon = False175for token in tokens:176token.at_line_start = at_line_start177178if token.type == "COLON":179at_line_start = False180indent = MAY_INDENT181token.must_indent = False182183elif token.type == "NEWLINE":184at_line_start = True185if indent == MAY_INDENT:186indent = MUST_INDENT187token.must_indent = False188189elif token.type == "WS":190assert token.at_line_start == True191at_line_start = True192token.must_indent = False193194else:195# A real token; only indent after COLON NEWLINE196if indent == MUST_INDENT:197token.must_indent = True198else:199token.must_indent = False200at_line_start = False201indent = NO_INDENT202203yield token204lexer.at_line_start = at_line_start205206def _new_token(type, lineno):207tok = lex.LexToken()208tok.type = type209tok.value = None210tok.lineno = lineno211return tok212213# Synthesize a DEDENT tag214def DEDENT(lineno):215return _new_token("DEDENT", lineno)216217# Synthesize an INDENT tag218def INDENT(lineno):219return _new_token("INDENT", lineno)220221222# Track the indentation level and emit the right INDENT / DEDENT events.223def indentation_filter(tokens):224# A stack of indentation levels; will never pop item 0225levels = [0]226token = None227depth = 0228prev_was_ws = False229for token in tokens:230## if 1:231## print "Process", token,232## if token.at_line_start:233## print "at_line_start",234## if token.must_indent:235## print "must_indent",236237238# WS only occurs at the start of the 
line239# There may be WS followed by NEWLINE so240# only track the depth here. Don't indent/dedent241# until there's something real.242if token.type == "WS":243assert depth == 0244depth = len(token.value)245prev_was_ws = True246# WS tokens are never passed to the parser247continue248249if token.type == "NEWLINE":250depth = 0251if prev_was_ws or token.at_line_start:252# ignore blank lines253continue254# pass the other cases on through255yield token256continue257258# then it must be a real token (not WS, not NEWLINE)259# which can affect the indentation level260261prev_was_ws = False262if token.must_indent:263# The current depth must be larger than the previous level264if not (depth > levels[-1]):265raise IndentationError("expected an indented block")266267levels.append(depth)268yield INDENT(token.lineno)269270elif token.at_line_start:271# Must be on the same level or one of the previous levels272if depth == levels[-1]:273# At the same level274pass275elif depth > levels[-1]:276raise IndentationError("indentation increase but not in new block")277else:278# Back up; but only if it matches a previous level279try:280i = levels.index(depth)281except ValueError:282raise IndentationError("inconsistent indentation")283for _ in range(i+1, len(levels)):284yield DEDENT(token.lineno)285levels.pop()286287yield token288289### Finished processing ###290291# Must dedent any remaining levels292if len(levels) > 1:293assert token is not None294for _ in range(1, len(levels)):295yield DEDENT(token.lineno)296297298# The top-level filter adds an ENDMARKER, if requested.299# Python's grammar uses it.300def filter(lexer, add_endmarker = True):301token = None302tokens = iter(lexer.token, None)303tokens = track_tokens_filter(lexer, tokens)304for token in indentation_filter(tokens):305yield token306307if add_endmarker:308lineno = 1309if token is not None:310lineno = token.lineno311yield _new_token("ENDMARKER", lineno)312313# Combine Ply and my filters into a new lexer314315class 
IndentLexer(object):316def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):317self.lexer = lex.lex(debug=debug, optimize=optimize, lextab=lextab, reflags=reflags)318self.token_stream = None319def input(self, s, add_endmarker=True):320self.lexer.paren_count = 0321self.lexer.input(s)322self.token_stream = filter(self.lexer, add_endmarker)323def token(self):324try:325return self.token_stream.next()326except StopIteration:327return None328329########## Parser (tokens -> AST) ######330331# also part of Ply332#import yacc333334# I use the Python AST335from compiler import ast336337# Helper function338def Assign(left, right):339names = []340if isinstance(left, ast.Name):341# Single assignment on left342return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)343elif isinstance(left, ast.Tuple):344# List of things - make sure they are Name nodes345names = []346for child in left.getChildren():347if not isinstance(child, ast.Name):348raise SyntaxError("that assignment not supported")349names.append(child.name)350ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]351return ast.Assign([ast.AssTuple(ass_list)], right)352else:353raise SyntaxError("Can't do that yet")354355356# The grammar comments come from Python's Grammar/Grammar file357358## NB: compound_stmt in single_input is followed by extra NEWLINE!359# file_input: (NEWLINE | stmt)* ENDMARKER360def p_file_input_end(p):361"""file_input_end : file_input ENDMARKER"""362p[0] = ast.Stmt(p[1])363def p_file_input(p):364"""file_input : file_input NEWLINE365| file_input stmt366| NEWLINE367| stmt"""368if isinstance(p[len(p)-1], basestring):369if len(p) == 3:370p[0] = p[1]371else:372p[0] = [] # p == 2 --> only a blank line373else:374if len(p) == 3:375p[0] = p[1] + p[2]376else:377p[0] = p[1]378379380# funcdef: [decorators] 'def' NAME parameters ':' suite381# ignoring decorators382def p_funcdef(p):383"funcdef : DEF NAME parameters COLON suite"384p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, 
p[5])385386# parameters: '(' [varargslist] ')'387def p_parameters(p):388"""parameters : LPAR RPAR389| LPAR varargslist RPAR"""390if len(p) == 3:391p[0] = []392else:393p[0] = p[2]394395396# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) |397# highly simplified398def p_varargslist(p):399"""varargslist : varargslist COMMA NAME400| NAME"""401if len(p) == 4:402p[0] = p[1] + p[3]403else:404p[0] = [p[1]]405406# stmt: simple_stmt | compound_stmt407def p_stmt_simple(p):408"""stmt : simple_stmt"""409# simple_stmt is a list410p[0] = p[1]411412def p_stmt_compound(p):413"""stmt : compound_stmt"""414p[0] = [p[1]]415416# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE417def p_simple_stmt(p):418"""simple_stmt : small_stmts NEWLINE419| small_stmts SEMICOLON NEWLINE"""420p[0] = p[1]421422def p_small_stmts(p):423"""small_stmts : small_stmts SEMICOLON small_stmt424| small_stmt"""425if len(p) == 4:426p[0] = p[1] + [p[3]]427else:428p[0] = [p[1]]429430# small_stmt: expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt |431# import_stmt | global_stmt | exec_stmt | assert_stmt432def p_small_stmt(p):433"""small_stmt : flow_stmt434| expr_stmt"""435p[0] = p[1]436437# expr_stmt: testlist (augassign (yield_expr|testlist) |438# ('=' (yield_expr|testlist))*)439# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |440# '<<=' | '>>=' | '**=' | '//=')441def p_expr_stmt(p):442"""expr_stmt : testlist ASSIGN testlist443| testlist """444if len(p) == 2:445# a list of expressions446p[0] = ast.Discard(p[1])447else:448p[0] = Assign(p[1], p[3])449450def p_flow_stmt(p):451"flow_stmt : return_stmt"452p[0] = p[1]453454# return_stmt: 'return' [testlist]455def p_return_stmt(p):456"return_stmt : RETURN testlist"457p[0] = ast.Return(p[2])458459460def p_compound_stmt(p):461"""compound_stmt : if_stmt462| funcdef"""463p[0] = p[1]464465def p_if_stmt(p):466'if_stmt : IF test COLON suite'467p[0] = ast.If([(p[2], p[4])], None)468469def p_suite(p):470"""suite : 
simple_stmt471| NEWLINE INDENT stmts DEDENT"""472if len(p) == 2:473p[0] = ast.Stmt(p[1])474else:475p[0] = ast.Stmt(p[3])476477478def p_stmts(p):479"""stmts : stmts stmt480| stmt"""481if len(p) == 3:482p[0] = p[1] + p[2]483else:484p[0] = p[1]485486## No using Python's approach because Ply supports precedence487488# comparison: expr (comp_op expr)*489# arith_expr: term (('+'|'-') term)*490# term: factor (('*'|'/'|'%'|'//') factor)*491# factor: ('+'|'-'|'~') factor | power492# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'493494def make_lt_compare((left, right)):495return ast.Compare(left, [('<', right),])496def make_gt_compare((left, right)):497return ast.Compare(left, [('>', right),])498def make_eq_compare((left, right)):499return ast.Compare(left, [('==', right),])500501502binary_ops = {503"+": ast.Add,504"-": ast.Sub,505"*": ast.Mul,506"/": ast.Div,507"<": make_lt_compare,508">": make_gt_compare,509"==": make_eq_compare,510}511unary_ops = {512"+": ast.UnaryAdd,513"-": ast.UnarySub,514}515precedence = (516("left", "EQ", "GT", "LT"),517("left", "PLUS", "MINUS"),518("left", "MULT", "DIV"),519)520521def p_comparison(p):522"""comparison : comparison PLUS comparison523| comparison MINUS comparison524| comparison MULT comparison525| comparison DIV comparison526| comparison LT comparison527| comparison EQ comparison528| comparison GT comparison529| PLUS comparison530| MINUS comparison531| power"""532if len(p) == 4:533p[0] = binary_ops[p[2]]((p[1], p[3]))534elif len(p) == 3:535p[0] = unary_ops[p[1]](p[2])536else:537p[0] = p[1]538539# power: atom trailer* ['**' factor]540# trailers enables function calls. 
# power: atom trailer* ['**' factor]
# trailers enables function calls.  I only allow one level of calls
# so this is 'trailer'
def p_power(p):
    """power : atom
             | atom trailer"""
    if len(p) == 2:
        p[0] = p[1]
    else:
        if p[2][0] == "CALL":
            p[0] = ast.CallFunc(p[1], p[2][1], None, None)
        else:
            raise AssertionError("not implemented")


def p_atom_name(p):
    """atom : NAME"""
    p[0] = ast.Name(p[1])


def p_atom_number(p):
    """atom : NUMBER
            | STRING"""
    p[0] = ast.Const(p[1])


def p_atom_tuple(p):
    """atom : LPAR testlist RPAR"""
    p[0] = p[2]


# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
def p_trailer(p):
    "trailer : LPAR arglist RPAR"
    p[0] = ("CALL", p[2])


# testlist: test (',' test)* [',']
# Contains shift/reduce error
def p_testlist(p):
    """testlist : testlist_multi COMMA
                | testlist_multi """
    if len(p) == 2:
        p[0] = p[1]
    else:
        # May need to promote singleton to tuple
        if isinstance(p[1], list):
            p[0] = p[1]
        else:
            p[0] = [p[1]]
    # Convert into a tuple?
    if isinstance(p[0], list):
        p[0] = ast.Tuple(p[0])


def p_testlist_multi(p):
    """testlist_multi : testlist_multi COMMA test
                      | test"""
    if len(p) == 2:
        # singleton
        p[0] = p[1]
    else:
        if isinstance(p[1], list):
            p[0] = p[1] + [p[3]]
        else:
            # singleton -> tuple
            p[0] = [p[1], p[3]]


# test: or_test ['if' or_test 'else' test] | lambdef
# as I don't support 'and', 'or', and 'not' this works down to 'comparison'
def p_test(p):
    "test : comparison"
    p[0] = p[1]


# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)
# XXX INCOMPLETE: this doesn't allow the trailing comma
def p_arglist(p):
    """arglist : arglist COMMA argument
               | argument"""
    if len(p) == 4:
        p[0] = p[1] + [p[3]]
    else:
        p[0] = [p[1]]


# argument: test [gen_for] | test '=' test  # Really [keyword '='] test
def p_argument(p):
    "argument : test"
    p[0] = p[1]


def p_error(p):
    raise SyntaxError(p)


class GardenSnakeParser(object):
    """Tie the indentation-aware lexer to the yacc-generated parser."""

    def __init__(self, lexer=None):
        if lexer is None:
            lexer = IndentLexer()
        self.lexer = lexer
        self.parser = yacc.yacc(start="file_input_end")

    def parse(self, code):
        """Parse GardenSnake source into a compiler.ast Module."""
        self.lexer.input(code)
        result = self.parser.parse(lexer=self.lexer)
        return ast.Module(None, result)


###### Code generation ######

from compiler import misc, syntax, pycodegen


class GardenSnakeCompiler(object):
    """Parse GardenSnake source and emit a Python code object."""

    def __init__(self):
        self.parser = GardenSnakeParser()

    def compile(self, code, filename="<string>"):
        tree = self.parser.parse(code)
        misc.set_filename(filename, tree)
        syntax.check(tree)
        gen = pycodegen.ModuleCodeGenerator(tree)
        code = gen.getCode()
        return code


####### Test code #######

compile = GardenSnakeCompiler().compile

code = r"""

print('LET\'S TRY THIS \\OUT')

#Comment here
def x(a):
    print('called with',a)
    if a == 1:
        return 2
    if a*2 > 10: return 999 / 4
    # Another comment here

    return a+2*3

ints = (1, 2,
   3, 4,
5)
print('mutiline-expression', ints)

t = 4+1/3*2+6*(9-5+1)
print('predence test; should be 34+2/3:', t, t==(34+2/3))

print('numbers', 1,2,3,4,5)
if 1:
 8
 a=9
 print(x(a))

print(x(1))
print(x(2))
print(x(8),'3')
print('this is decimal', 1/5)
print('BIG DECIMAL', 1.234567891234567e12345)

"""


# Set up the GardenSnake run-time environment.  Writes the same bytes
# the original Py2 'print "-->", ...' statement produced.
def print_(*args):
    sys.stdout.write("--> " + " ".join(map(str, args)) + "\n")

globals()["print"] = print_

compiled_code = compile(code)

# Two-argument exec form: identical to 'exec ... in ...' on Python 2.
exec(compiled_code, globals())
sys.stdout.write("Done\n")