PER: tokenizer

#!/usr/bin/env python3
from enum import Enum
from typing import List, NamedTuple, Optional, Tuple, Union


class PerTokenName(Enum):
    Title = "; @Title"
    Props = "; @Props"
    Author = "; @Author"
    Changelog = "; @Changelog"
    Manufacturer = "; @Manufacturer"
    Doc = "; @Doc"
    Core = "; @Core"
    Chip = "; @Chip"
    Chiplist = "; @Chiplist"
    Copyright = "; @Copyright"
    Description = "; @Description"
    Keywords = "; @Keywords"
    Date = "; @Date"
    #HeaderSep = "; -------"
    Id = "; @Id"

    CONFIG = "config" # what is this
    BASE = "base" # base address, base addrspace:expr
    # expr can be int or (d.l(addrspace:off)) -> ???
    WIDTH = "width" # what is this
    SAVEINDEX = "saveindex" # what is this

    TREE_OPEN = "tree.open"
    TREE_CLOSE = "tree.close" # like tree.open
    TREE_END = "tree.end"
    TREE = "tree"

    # group of stuff (sometimes one reg?)
    # group (addrspace:)start--end
    # addrspaces: c15 c14 ad(=what?) <none> EE(avr eeprom) D(avr data?) d(msp430) CSR(riscv) NAR(xtensa) "e:comp.base('name',-1)"(also xtensa) SPR(xtensa)
    # also seen: group iospace() (teaklite)
    GROUP = "group"
    HGROUP = "hgroup"
    RGROUP = "rgroup"
    SGROUP = "sgroup"
    WGROUP = "wgroup"

    # a register (sometimes hidden)
    # line.qual offset "CODENAME,description"
    HIDE = "hide"
    LINE = "line"

    # bitfields and stuff in registers
    # bitfld.qual IDK start(--end) "CODE,description" ("if 0","if 1",...)
    BITFLD = "bitfld"
    ABITFLD = "abitfld"
    RBITFLD = "rbitfld"
    HEXFLD = "hexfld"
    #RHEXFLD? WHEXFLD??
    EVENTFLD = "eventfld"
    SETCLRFLD = "setclrfld"

    # masks in registers?
    # hexmask.qual IDK start--end MASK "CODE,description"
    DECMASK = "decmask"
    #RDECMASK? WDECMASK??
    HEXMASK = "hexmask"
    RHEXMASK = "rhexmask"
    #WHEXMASK??

    COPY = "copy"
    REPEAT_END = "repeat.end"
    REPEAT_REPLAY = "repeat.replay"
    REPEAT = "repeat"


# to ignore: assert autoindent.{on,off} button elif else endif if in
# newline sif textline textfld x
# IDK: entry, read, wait, saveindex, saveout, set, getx, register
# EH: include, endian.{be,le} (-> conditional endianness)
# TODO: copy: copy from previous group? (->derivedFrom whee)
# TODO: repeat{,.end}: repeat macro stuff (cf. dimIndexElement)
# TODO: repeat.replay: copy+replay
PER_TOKEN_IGNORE = {
    'assert','autoindent.on','autoindent.off','button','elif','else','endif',
    'if','in','newline','sif','textline','textfld','x',
    'entry','read','wait','saveindex','saveout','set','getx','register',
    'include','endian.be','endian.le'
}
PER_TOKEN_HEADER = [PerTokenName.__members__[x] for x in (
    'Title','Props','Author','Changelog','Manufacturer','Doc','Core','Chiplist',
    'Copyright','Id','Chip','Description','Keywords','Date', #'HeaderSep',
)]
PER_TOKEN_BODY = [v for k, v in PerTokenName.__members__.items() if v not in PER_TOKEN_HEADER]
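# NB: PER_TOKEN_BODY keeps enum declaration order, and find_tok() below matches
# by prefix, so dotted names ("tree.open", "repeat.end") are declared before the
# bare "tree"/"repeat" tokens to win the prefix match.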


class PerTokenQual(Enum):
    # also .<hex>?
    BYTE = "byte"
    WORD = "word"
    LONG = "long"
    QUAD = "quad"
    SHORT = "short"
    SBYTE = "sbyte"
    TBYTE = "tbyte"
    # used for mask stuff
    """
    LONG_TBYTE = "long.tbyte"
    LONG_BYTE = "long.byte"
    LONG_WORD = "long.word"
    LONG_LONG = "long.long"
    WORD_BYTE = "word.byte"
    WORD_WORD = "word.word"
    BYTE_BYTE = "byte.byte"
    QUAD_BYTE = "quad.byte"
    QUAD_SBYTE = "quad.sbyte"
    QUAD_TBYTE = "quad.tbyte"
    QUAD_WORD = "quad.word"
    QUAD_SHORT = "quad.short"
    QUAD_LONG = "quad.long"
    QUAD_QUAD = "quad.quad"
    TBYTE_BYTE = "tbyte.byte"
    TBYTE_WORD = "tbyte.word"
    """


class PerToken(NamedTuple):
    name: PerTokenName
    qual: Optional[Union[PerTokenQual, Tuple[PerTokenQual, PerTokenQual]]] = None
    # NB: the default list is shared across instances; callers always pass a fresh one
    args: List[str] = []
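
# A rough sketch (hypothetical PER input) of what a parsed body line maps to --
# note args is a plain whitespace split, so quoted strings stay fragmented:
#   line.long 0x10 "CTRL,control register"
#   -> PerToken(name=PerTokenName.LINE, qual=PerTokenQual.LONG,
#               args=['0x10', '"CTRL,control', 'register"'])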


# TODO: tokenize into data stream with useful arguments
# TODO: data stream -> tree structure


def find_tok(l: str, hdr: bool) -> Optional[PerTokenName]:
    ll = l.lower() if hdr else l
    for h in (PER_TOKEN_HEADER if hdr else PER_TOKEN_BODY):
        #print("ll='%s', h='%s'"%(ll,h.value))
        if ll.startswith(h.value.lower() if hdr else h.value):
            return h
    # header lookups must always succeed; body lookups may return None so the
    # caller can try qualifier splitting or the ignore list
    assert not hdr, "Unknown token on line: %s"%l
    return None
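
# e.g. find_tok('; @title foo', True) -> PerTokenName.Title (case-insensitive
# prefix match on the header list); find_tok('tree.open "X"', False)
# -> PerTokenName.TREE_OPEN; unknown body tokens return None.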


def tokenize_body(f, l=None):
    prevtell = -1
    while True:
        if l is None: l = f.readline().strip()
        if len(l) == 0:
            tell = f.tell()
            if tell == prevtell: break # EOF
            prevtell = tell
            continue
        ll = l.lower()
        sp = l.split()

        # plain token: require an exact match on the first field so qualified
        # forms like "line.long" are not swallowed by the bare "line" prefix
        # (the exact match also catches dotted token names like "tree.open")
        t = find_tok(ll, False)
        #print("t",t)
        if t is not None and sp[0].lower() == t.value:
            yield PerToken(t, args=sp[1:])
            l = None
            continue

        # token with one or two dot-separated qualifiers, e.g. "line.long"
        ht = sp[0].lower().split('.')
        if len(ht) > 1:
            tq = find_tok(ht[0], False)
            if tq is not None and ht[0] == tq.value:
                assert len(ht) in {2,3}, "bad qual %s in line %s"%(repr(ht),l)
                try:
                    # qualifier strings are enum *values* ("long"), not names
                    quals = [PerTokenQual(k) for k in ht[1:]]
                    quals = quals[0] if len(quals) == 1 else tuple(quals)
                except ValueError:
                    quals = None # e.g. the ".<hex>" qualifiers noted above
                yield PerToken(tq, qual=quals, args=sp[1:])
                l = None
                continue

        # fall back to the loose prefix match, else the line must be a comment
        # or one of the statements we deliberately ignore
        if t is not None:
            yield PerToken(t, args=sp[1:])
        else:
            assert l[0] == ';' or sp[0].lower() in PER_TOKEN_IGNORE, "Unknown token on line: %s"%l
        l = None


def tokenize(f):
    curtok = None
    curlines = []

    prevtell = -1
    while True:
        l = f.readline().strip()
        if len(l) == 0:
            tell = f.tell()
            if tell == prevtell: break # EOF
            prevtell = tell
            continue
        if l[0] != ';':
            # header done: flush the pending header token so the last header
            # field is not dropped, then hand the first body line off
            if curtok is not None:
                yield PerToken(curtok, args=curlines)
                curtok = None
            yield from tokenize_body(f, l)
            break
        if l.startswith('; -------'): continue

        if l.startswith('; @') and l[3] != ' ':
            # new header token: flush the old one
            if curtok is not None:
                yield PerToken(curtok, args=curlines)
            # start the new one
            curtok = find_tok(l, True)
            curlines = [l[len(curtok.value)+1:].strip()]
        else:
            # continuation line of the current header token
            curlines.append(l[3:].strip())

    # EOF before any body: flush whatever header token is still pending
    if curtok is not None:
        yield PerToken(curtok, args=curlines)
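
# A minimal smoke-test sketch (made-up PER snippet, not from a real file):
#
#   import io
#   src = ('; @Title example\n; @Author nobody\n'
#          'tree.open "TOP"\nline.long 0x0 "R0,reg"\ntree.end\n')
#   for tok in tokenize(io.StringIO(src)):
#       print(tok)
#
# which should yield the Title and Author header tokens followed by TREE_OPEN,
# LINE (qual=LONG) and TREE_END body tokens.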


# flatten copy and repeat statements
def tokenize_flatten(itor):
    yield from itor # TODO


if __name__ == '__main__':
    import glob
    for p in glob.glob('t/*/*.per'):
        print(p)
        with open(p,'r') as f:
            for x in tokenize(f): pass

    with open('t/arm/peram65xx.per','r') as f:
        for x in tokenize(f): print(x)
    with open('t/arm/perfm0p.per','r') as f:
        for x in tokenize(f): print(x)