Source code for deepfold.runner.parser

#!/usr/bin/env python3


import sys

import ply.lex as lex
import ply.yacc as yacc

RE_ID = r"[a-zA-Z_][a-zA-Z_0-9]*"
RE_INT = r"[1-9][0-9]*"

TOKENS = [
    # Initial state tokens
    "BEGIN_ARRAY",
    "BEGIN_OBJECT",
    "END_ARRAY",
    "END_OBJECT",
    "NAME_SEPARATOR",
    "VALUE_SEPARATOR",
    "QUOTATION_MARK",
    "FALSE",
    "TRUE",
    "NULL",
    "DECIMAL_POINT",
    "DIGITS",
    "E",
    "MINUS",
    "PLUS",
    "ZERO",
    # String state tokens
    "UNESCAPED",
    "ESCAPE",
    # Escaped state tokens
    "REVERSE_SOLIDUS",
    "SOLIDUS",
    "BACKSPACE_CHAR",
    "FORM_FEED_CHAR",
    "LINE_FEED_CHAR",
    "CARRIAGE_RETURN_CHAR",
    "TAB_CHAR",
    "UNICODE_HEX",
    # Graph
    "BEGIN_EDGE_LIST",  # [[
    "END_EDGE_LIST",  # ]]
    "EDGE_SEP",  # -
    # Predict
    "STOI_SEP",
    "NUM_SYM_SEP",
    # "STOI_NUM",
    # Identifier
    "ID",
    "SEMICOLON",
    "NEWLINE",
]

RESERVED = {
    # Types
    "model": "MODEL",
    "entity": "ENTITY",
    "predict": "PREDICT",
    # Pred
    "using": "USING",
    "stoi": "STOI",
    "pair": "PAIR",
    "sample": "SAMPLE",
    # Variables
    "graph": "GRAPH",
    "let": "LET",
    "in": "IN",
}

TOKENS += RESERVED.values()
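
# Note (added remark): PLY expects the lexer class to expose a single
# `tokens` list, so the reserved-word token types are folded into TOKENS
# here. For example, the input 'model' lexes as a MODEL token (via the
# RESERVED lookup in t_ID below) rather than as a generic ID.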


class Lexer:
    def __init__(self, debug=False, **kwargs):
        self.lexer = lex.lex(
            module=self,
            debug=debug,
            **kwargs,
        )

    tokens = TOKENS

    states = (
        ("iterable", "inclusive"),
        ("string", "exclusive"),
        ("escaped", "exclusive"),
        ("graph", "inclusive"),
        ("stoi", "exclusive"),
    )

    def t_ANY_error(self, t):
        last_cr = self.lexer.lexdata.rfind("\n", 0, t.lexpos)
        if last_cr < 0:
            last_cr = 0
        column = t.lexpos - last_cr + 1
        print(f"Illegal character '{t.value[0]}' at line {t.lineno} pos {column}")
        t.lexer.skip(1)

    t_SEMICOLON = r"\x3B"  # ';'

    # Count newlines so line numbers stay accurate
    def t_NEWLINE(self, t):
        r"\n+"
        t.lexer.lineno += len(t.value)
        return t

    # Skip over space ('\x20'), tab ('\x09'), and carriage-return ('\x0D')
    # characters in the default state
    t_ignore = "\x20\x09\x0D"

    @lex.TOKEN(RE_ID)
    def t_ID(self, t):
        if t.value in RESERVED:
            t.type = RESERVED[t.value]
        if t.value == "graph":
            t.lexer.push_state("graph")
        if t.value == "stoi":
            t.lexer.push_state("stoi")
        return t

    # Iterable
    def t_iterable_NEWLINE(self, t):
        r"\n+"
        t.lexer.lineno += len(t.value)

    # t_BEGIN_ARRAY = r"\x5B"  # '['
    # t_BEGIN_OBJECT = r"\x7B"  # '{'

    def t_BEGIN_ARRAY(self, t):
        r"\x5B"
        t.lexer.push_state("iterable")
        return t

    def t_BEGIN_OBJECT(self, t):
        r"\x7B"
        t.lexer.push_state("iterable")
        return t

    def t_iterable_END_ARRAY(self, t):
        r"\x5D"
        t.lexer.pop_state()
        return t

    def t_iterable_END_OBJECT(self, t):
        r"\x7D"
        t.lexer.pop_state()
        return t

    # t_END_ARRAY = r"\x5D"  # ']'
    # t_END_OBJECT = r"\x7D"  # '}'

    # Default state tokens
    t_NAME_SEPARATOR = r"\x3A"  # ':'
    t_VALUE_SEPARATOR = r"\x2C"  # ','
    t_FALSE = r"\x66\x61\x6c\x73\x65"  # 'false'
    t_TRUE = r"\x74\x72\x75\x65"  # 'true'
    t_NULL = r"\x6e\x75\x6c\x6c"  # 'null'
    t_DECIMAL_POINT = r"\x2E"  # '.'
    t_DIGITS = r"[\x30-\x39]+"  # '0'..'9'
    t_E = r"[\x45\x65]"  # 'e' or 'E'
    t_MINUS = r"\x2D"  # '-'
    t_PLUS = r"\x2B"  # '+'
    t_ZERO = r"\x30"  # '0'

    def t_ignore_COMMENT(self, t):
        r"\#.*"

    # Graph
    @lex.TOKEN(RE_ID)
    def t_graph_ID(self, t):
        if t.value in RESERVED:
            t.type = RESERVED[t.value]
        return t

    def t_graph_BEGIN_EDGE_LIST(self, t):
        r"\x5B\x5B"  # [[
        return t

    def t_graph_EDGE_SEP(self, t):
        r"\x2D"  # '-'
        return t

    t_graph_VALUE_SEPARATOR = "\x2C"  # ','
    t_graph_ignore = "\x20\x09\x0D"

    # Ignore '\n' (but keep the line count accurate)
    def t_graph_NEWLINE(self, t):
        r"\n+"
        t.lexer.lineno += len(t.value)

    def t_graph_END_EDGE_LIST(self, t):
        r"\x5D\x5D"  # ]]
        t.lexer.pop_state()
        return t

    # Enter the string state on an opening quotation mark
    def t_QUOTATION_MARK(self, t):
        r"\x22"  # '"'
        t.lexer.push_state("string")
        return t

    # Don't skip over any tokens inside the string state
    t_string_ignore = ""

    # TODO(dewitt): Verify that this matches the correct range, the spec
    # says '%x5D-10FFFF' but most pythons by default will not handle that
    def t_string_UNESCAPED(self, t):
        r"[\x20-\x21\x23-\x5B\x5D-\xFF]+"
        t.value = str(t.value)
        return t

    # Exit the string state on an unescaped closing quotation mark
    def t_string_QUOTATION_MARK(self, t):
        r"\x22"  # '"'
        t.lexer.pop_state()
        return t

    # Enter the escaped state on a '\' character
    def t_string_ESCAPE(self, t):
        r"\x5C"  # '\'
        t.lexer.push_state("escaped")
        return t

    # Don't skip over any tokens inside the escaped state
    t_escaped_ignore = ""

    def t_escaped_QUOTATION_MARK(self, t):
        r"\x22"  # '"'
        t.lexer.pop_state()
        return t

    def t_escaped_REVERSE_SOLIDUS(self, t):
        r"\x5C"  # '\'
        t.lexer.pop_state()
        return t

    def t_escaped_SOLIDUS(self, t):
        r"\x2F"  # '/'
        t.lexer.pop_state()
        return t

    def t_escaped_BACKSPACE_CHAR(self, t):
        r"\x62"  # 'b'
        t.lexer.pop_state()
        t.value = chr(0x0008)
        return t

    def t_escaped_FORM_FEED_CHAR(self, t):
        r"\x66"  # 'f'
        t.lexer.pop_state()
        t.value = chr(0x000C)
        return t

    def t_escaped_CARRIAGE_RETURN_CHAR(self, t):
        r"\x72"  # 'r'
        t.lexer.pop_state()
        t.value = chr(0x000D)
        return t

    def t_escaped_LINE_FEED_CHAR(self, t):
        r"\x6E"  # 'n'
        t.lexer.pop_state()
        t.value = chr(0x000A)
        return t

    def t_escaped_TAB_CHAR(self, t):
        r"\x74"  # 't'
        t.lexer.pop_state()
        t.value = chr(0x0009)
        return t

    # Note: the character class previously contained literal commas, which
    # would wrongly accept ',' as a hex digit (e.g. in '\u0,41')
    def t_escaped_UNICODE_HEX(self, t):
        r"\x75[\x30-\x39\x41-\x46\x61-\x66]{4}"  # 'uXXXX'
        t.lexer.pop_state()
        return t

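    # Illustrative trace, derived from the string/escape rules above: lexing
    # the input  "a\n"  (quote, 'a', backslash, 'n', quote) walks the states
    # INITIAL -> string -> escaped -> string -> INITIAL and should yield
    # QUOTATION_MARK, UNESCAPED, ESCAPE, LINE_FEED_CHAR (value '\n'),
    # QUOTATION_MARK.
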
    # Stoichiometry
    @lex.TOKEN(RE_ID)
    def t_stoi_ID(self, t):
        if t.value in RESERVED:
            t.type = RESERVED[t.value]
        if t.value == "using":
            t.lexer.pop_state()
        return t

    def t_stoi_NUM_SYM_SEP(self, t):
        r"\x3A"  # ':'
        return t

    def t_stoi_STOI_SEP(self, t):
        r"\x2F"  # '/'
        return t

    @lex.TOKEN(RE_INT)
    def t_stoi_DIGITS(self, t):
        return t

    t_stoi_ignore = "\x20\x09\x0D"

    # Tokenizer
    def tokenize(self, data, *args, **kwargs):
        """Invoke the lexer on an input string and return the list of tokens.

        This is relatively inefficient and should only be used for
        testing/debugging as it slurps up all tokens into one list.

        Args:
            data: The input to be tokenized.

        Returns:
            A list of LexTokens
        """
        self.lexer.input(data)
        tokens = list()
        while True:
            token = self.lexer.token()
            if not token:
                break
            tokens.append(token)
        return tokens

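
# Usage sketch for the lexer (illustrative; the input string is a made-up
# example of the DSL, not taken from the original module):
#
#     lexer = Lexer()
#     for tok in lexer.tokenize('let x {"a": 1}'):
#         print(tok.type)
#
# Tracing the rules above, this should print LET, ID, BEGIN_OBJECT,
# QUOTATION_MARK, UNESCAPED, QUOTATION_MARK, NAME_SEPARATOR, DIGITS,
# END_OBJECT.
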
class Parser:
    def __init__(self, lexer=None, debug=False, **kwargs):
        if lexer is not None:
            if isinstance(lexer, Lexer):
                self.lexer = lexer.lexer
            else:
                # Assume that the lexer is a lex instance or similar
                self.lexer = lexer
        else:
            self.lexer = Lexer(debug=debug).lexer
        self.parser = yacc.yacc(
            module=self,
            debug=debug,
            write_tables=False,
            **kwargs,
        )
        self.debug = debug

    tokens = TOKENS

    # Define the parser
    def p_script(self, p):
        """
        script : statements
        """
        p[0] = p[1:]

    def p_statements(self, p):
        """
        statements : statement
                   | statements NEWLINE statement
                   | statements SEMICOLON statement
        """
        if len(p) == 2:
            p[0] = [p[1]]
        elif len(p) == 3:
            p[1].append(p[2])
            p[0] = p[1]
        elif len(p) == 4:
            p[1].append(p[3])
            p[0] = p[1]

    def p_statement(self, p):
        """
        statement : command
        """
        p[0] = p[1]

    def p_command(self, p):
        """
        command :
                | model
                | entity
                | predict
                | graph
                | def
        """
        if len(p) == 1:
            p[0] = None
        else:
            p[0] = p[1]

    def p_id(self, p):
        """id : ID"""
        p[0] = f"${p[1]}"

    def p_variable_let(self, p):
        """
        def : LET id object
        """
        p[0] = {"command": "variable", "id": p[2], "value": p[3]}

    def p_model_definition(self, p):
        """
        model : MODEL id object
        """
        p[0] = {"command": "model", "id": p[2], "models": p[3]}

    def p_predict(self, p):
        """
        predict : PREDICT string STOI stoi_list USING id
                | PREDICT string STOI stoi_list USING id pred_options
                | PREDICT id STOI stoi_list USING id
                | PREDICT id STOI stoi_list USING id pred_options
        """
        if len(p) == 7:
            p[0] = {
                "command": "predict",
                "name": p[2],
                "stoi": p[4],
                "model": p[6],
                "options": [],
            }
        elif len(p) == 8:
            p[0] = {
                "command": "predict",
                "name": p[2],
                "stoi": p[4],
                "model": p[6],
                "options": p[7],
            }

    def p_pred_opts(self, p):
        """
        pred_options : pred_option
                     | pred_options pred_option
        """
        if len(p) == 1:
            p[0] = []
        elif len(p) == 2:
            p[0] = [p[1]]
        elif len(p) == 3:
            p[1].append(p[2])
            p[0] = p[1]

    def p_pred_opt(self, p):
        """
        pred_option : PAIR string object
                    | SAMPLE string object
                    | IN object
        """
        if len(p) == 4:
            p[0] = {"command": p[1], "mode": p[2], "options": p[3]}
        elif len(p) == 3:
            p[0] = {"command": p[1], "mode": "path", "options": p[2]}

    def p_object(self, p):
        """
        object :
               | value
        """
        if len(p) == 1:
            p[0] = None
        else:
            p[0] = p[1]

    def p_entity(self, p):
        """
        entity : ENTITY id dict
               | ENTITY id string
        """
        p[0] = {"command": "entity", "id": p[2], "options": p[3]}

    def p_stoi_list(self, p):
        """
        stoi_list :
                  | stoi_list stoi_entry STOI_SEP
                  | stoi_list stoi_entry
        """
        if len(p) == 1:
            p[0] = list()
        else:
            p[1].append(p[2])
            p[0] = p[1]

    def p_stoi_entry(self, p):
        """
        stoi_entry : id NUM_SYM_SEP integer
        """
        p[0] = (p[1], p[3])

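    # Example, derived from the predict/stoi rules above (the identifiers
    # are hypothetical): a statement such as
    #
    #     predict "job" stoi A:2/B:1 using m1
    #
    # should reduce stoi_list to [("$A", 2), ("$B", 1)], with '/' as the
    # entry separator and ':' separating a symbol from its count.
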
    def p_graph(self, p):
        """
        graph : GRAPH id edge_list
        """
        p[0] = {"command": "graph", "id": p[2], "edge_list": p[3]}

    def p_begin_edge(self, p):
        """
        begin_edge : BEGIN_EDGE_LIST
        """
        p[0] = None

    def p_end_edge(self, p):
        """
        end_edge : END_EDGE_LIST
        """
        p[0] = None

    def p_edge_list(self, p):
        """
        edge_list : begin_edge edges end_edge
        """
        p[0] = list(p[2])

    def p_edges(self, p):
        """
        edges :
              | edges edge value_separator
              | edges edge
        """
        if len(p) == 1:
            p[0] = list()
        else:
            p[1].append(p[2])
            p[0] = p[1]

    def p_edge(self, p):
        """
        edge : id EDGE_SEP id
        """
        p[0] = (p[1], p[3])

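    # Example, derived from the graph rules above (identifiers are
    # hypothetical): 'graph g [[a - b, b - c]]' should parse to
    #
    #     {"command": "graph", "id": "$g",
    #      "edge_list": [("$a", "$b"), ("$b", "$c")]}
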
    # JSON parser
    def p_value(self, p):
        """
        value : dict
              | array
              | number
              | string
              | id
        """
        p[0] = p[1]

    def p_value_false(self, p):
        """value : FALSE"""
        p[0] = False

    def p_value_true(self, p):
        """value : TRUE"""
        p[0] = True

    def p_value_null(self, p):
        """value : NULL"""
        p[0] = None

    def p_begin_dict(self, p):
        """
        begin_dict : BEGIN_OBJECT
        """
        p[0] = None

    def p_end_dict(self, p):
        """
        end_dict : END_OBJECT
        """
        p[0] = None

    def p_dict(self, p):
        """dict : begin_dict members end_dict"""
        p[0] = dict(p[2])

    def p_value_separator(self, p):
        """
        value_separator : VALUE_SEPARATOR
        """
        p[0] = None

    def p_members(self, p):
        """
        members :
                | members member value_separator
                | members member
        """
        if len(p) == 1:
            p[0] = list()
        else:
            p[1].append(p[2])
            p[0] = p[1]

    def p_member(self, p):
        """member : string NAME_SEPARATOR value"""
        p[0] = (p[1], p[3])

    def p_values(self, p):
        """
        values :
               | values value value_separator
               | values value
        """
        if len(p) == 1:
            p[0] = list()
        else:
            p[1].append(p[2])
            p[0] = p[1]

    def p_begin_array(self, p):
        """
        begin_array : BEGIN_ARRAY
        """
        p[0] = None

    def p_end_array(self, p):
        """
        end_array : END_ARRAY
        """
        p[0] = None

    def p_array(self, p):
        """array : begin_array values end_array"""
        p[0] = p[2]

    def p_number_positive(self, p):
        """
        number : integer
               | float
        """
        p[0] = p[1]

    def p_number_negative(self, p):
        """
        number : MINUS integer
               | MINUS float
        """
        p[0] = -p[2]

    def p_integer(self, p):
        """integer : int"""
        p[0] = p[1]

    def p_integer_exp(self, p):
        """integer : int exp"""
        p[0] = p[1] * (10 ** p[2])

    def p_number_float(self, p):
        """float : int frac"""
        p[0] = p[1] + p[2]

    def p_number_float_exp(self, p):
        """float : int frac exp"""
        p[0] = (p[1] + p[2]) * (10 ** p[3])

    def p_exp_negative(self, p):
        """exp : E MINUS DIGITS"""
        p[0] = -int(p[3])

    def p_exp(self, p):
        """exp : E DIGITS"""
        p[0] = int(p[2])

    def p_exp_positive(self, p):
        """exp : E PLUS DIGITS"""
        p[0] = int(p[3])

    def p_frac(self, p):
        """frac : DECIMAL_POINT DIGITS"""
        p[0] = float("." + p[2])

    def p_int_zero(self, p):
        """int : ZERO"""
        p[0] = int(0)

    def p_int_non_zero(self, p):
        """int : DIGITS"""
        if p[1].startswith("0"):
            raise SyntaxError("Leading zeroes are not allowed.")
        p[0] = int(p[1])

    def p_string(self, p):
        """string : QUOTATION_MARK chars QUOTATION_MARK"""
        p[0] = p[2]

    def p_chars(self, p):
        """
        chars :
              | chars char
        """
        if len(p) == 1:
            p[0] = str()
        else:
            p[0] = p[1] + p[2]

    def p_char(self, p):
        """
        char : UNESCAPED
             | ESCAPE QUOTATION_MARK
             | ESCAPE REVERSE_SOLIDUS
             | ESCAPE SOLIDUS
             | ESCAPE BACKSPACE_CHAR
             | ESCAPE FORM_FEED_CHAR
             | ESCAPE LINE_FEED_CHAR
             | ESCAPE CARRIAGE_RETURN_CHAR
             | ESCAPE TAB_CHAR
        """
        # Because the subscript [-1] has special meaning for YaccProduction
        # slices we use [len(p) - 1] to always take the last value.
        p[0] = p[len(p) - 1]

    def p_char_unicode_hex(self, p):
        """char : ESCAPE UNICODE_HEX"""
        # This looks more complicated than it is. The escaped string is of
        # the form \uXXXX and is assigned to p[2]. We take the trailing
        # XXXX string via p[2][1:], parse it as a radix 16 (hex) integer,
        # and convert that to the corresponding unicode character.
        p[0] = chr(int(p[2][1:], 16))

    def p_error(self, p):
        print(f"Syntax error at '{p}'")

    # Invoke the parser
    def parse(self, data, lexer=None, *args, **kwargs):
        if lexer is None:
            lexer = self.lexer
        if self.debug:
            lexer.input(data)
            print("=== TOKEN BEGIN ===")
            while True:
                tok = lexer.token()
                if not tok:
                    break
                print(tok)
            print("==== TOKEN END ====")
        lines = self.parser.parse(data, lexer=lexer, *args, **kwargs)
        if lines is not None:
            lines = [x for x in lines[0] if x is not None]
        else:
            lines = []
        return lines

# Maintain a reusable parser instance
parser = None
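
# Usage sketch for the parser (illustrative; the statement below is a
# hypothetical example of the DSL, not taken from the original module):
if __name__ == "__main__":
    demo = 'model m1 {"preset": "monomer"}'
    for command in Parser().parse(demo):
        print(command)
    # Tracing p_model_definition, this should print:
    #   {'command': 'model', 'id': '$m1', 'models': {'preset': 'monomer'}}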