PER: tokenizer

Triss 2022-06-02 03:37:38 +02:00
parent 53d3501604
commit 07ae409fdf
1 changed file with 218 additions and 0 deletions

t32per/pertoken.py Executable file

@@ -0,0 +1,218 @@
#!/usr/bin/env python3
from enum import Enum
from typing import List, NamedTuple, Optional, Tuple, Union
class PerTokenName(Enum):
    Title = "; @Title"
    Props = "; @Props"
    Author = "; @Author"
    Changelog = "; @Changelog"
    Manufacturer = "; @Manufacturer"
    Doc = "; @Doc"
    Core = "; @Core"
    Chip = "; @Chip"
    Chiplist = "; @Chiplist"
    Copyright = "; @Copyright"
    Description = "; @Description"
    Keywords = "; @Keywords"
    Date = "; @Date"
    #HeaderSep = "; -------"
    Id = "; @Id"
    CONFIG = "config" # what is this
    BASE = "base" # base address, base addrspace:expr
    # expr can be int or (d.l(addrspace:off)) -> ???
    WIDTH = "width" # what is this
    SAVEINDEX = "saveindex" # what is this
    TREE_OPEN = "tree.open"
    TREE_CLOSE = "tree.close" # like tree.open
    TREE_END = "tree.end"
    TREE = "tree"
    # group of stuff (sometimes one reg?)
    # group (addrspace:)start--end
    # addrspaces: c15 c14 ad(=what?) <none> EE(avr eeprom) D(avr data?) d(msp430)
    #   CSR(riscv) NAR(xtensa shit), "e:comp.base('name',-1)"(also xtensa) SPR(xtensa)
    # also seen: group iospace() (teaklite)
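    # illustrative guess at the syntax above (not from a real .per file):
    #   group D:0x0020--0x005F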
    GROUP = "group"
    HGROUP = "hgroup"
    RGROUP = "rgroup"
    SGROUP = "sgroup"
    WGROUP = "wgroup"
    # a register (sometimes hidden)
    # line.qual offset "CODENAME,description"
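    # illustrative guess (not from a real .per file):
    #   line.long 0x10 "CTRL,control register"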
    HIDE = "hide"
    LINE = "line"
    # bitfields and stuff in registers
    # bitfld.qual IDK start(--end) "CODE,description" ("if 0","if 1",...)
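    # illustrative guess (not from a real .per file; <IDK> field unknown):
    #   bitfld.long <IDK> 0--1 "MODE,mode select" ("off","on","auto","test")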
    BITFLD = "bitfld"
    ABITFLD = "abitfld"
    RBITFLD = "rbitfld"
    HEXFLD = "hexfld"
    #RHEXFLD? WHEXFLD??
    EVENTFLD = "eventfld"
    SETCLRFLD = "setclrfld"
    # masks in registers?
    # hexmask.qual IDK start--end MASK "CODE,description"
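    # illustrative guess (not from a real .per file; <IDK> field unknown):
    #   hexmask.long <IDK> 0--7 0xFF "ID,device id"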
    DECMASK = "decmask"
    #RDECMASK? WDECMASK??
    HEXMASK = "hexmask"
    RHEXMASK = "rhexmask"
    #WHEXMASK??
    COPY = "copy"
    REPEAT_END = "repeat.end"
    REPEAT_REPLAY = "repeat.replay"
    REPEAT = "repeat"

# to ignore: assert autoindent.{on,off} button elif else endif if in
#   newline sif textline textfld x
# IDK: entry, read, wait, saveindex, saveout, set, getx, register
# EH: include, endian.{be,le} (-> conditional endianness)
# TODO: copy: copy from previous group? (->derivedFrom whee)
# TODO: repeat{,.end}: repeat macro stuff (cf. dimIndexElement)
# TODO: repeat.replay: copy+replay
PER_TOKEN_IGNORE = {
    'assert','autoindent.on','autoindent.off','button','elif','else','endif',
    'if','in','newline','sif','textline','textfld','x',
    'entry','read','wait','saveindex','saveout','set','getx','register',
    'include','endian.be','endian.le'
}
PER_TOKEN_HEADER = [PerTokenName.__members__[x] for x in (
    'Title','Props','Author','Changelog','Manufacturer','Doc','Core','Chiplist',
    'Copyright','Id','Chip','Description','Keywords','Date',#'HeaderSep',
)]
PER_TOKEN_BODY = [v for k, v in PerTokenName.__members__.items() if v not in PER_TOKEN_HEADER]
class PerTokenQual(Enum):
    # also .<hex>?
    BYTE = "byte"
    WORD = "word"
    LONG = "long"
    QUAD = "quad"
    SHORT = "short"
    SBYTE = "sbyte"
    TBYTE = "tbyte"
    # used for mask stuff
    """
    LONG_TBYTE = "long.tbyte"
    LONG_BYTE = "long.byte"
    LONG_WORD = "long.word"
    LONG_LONG = "long.long"
    WORD_BYTE = "word.byte"
    WORD_WORD = "word.word"
    BYTE_BYTE = "byte.byte"
    QUAD_BYTE = "quad.byte"
    QUAD_SBYTE = "quad.sbyte"
    QUAD_TBYTE = "quad.tbyte"
    QUAD_WORD = "quad.word"
    QUAD_SHORT = "quad.short"
    QUAD_LONG = "quad.long"
    QUAD_QUAD = "quad.quad"
    TBYTE_BYTE = "tbyte.byte"
    TBYTE_WORD = "tbyte.word"
    """
class PerToken(NamedTuple):
    name: PerTokenName
    qual: Optional[Union[PerTokenQual, Tuple[PerTokenQual, PerTokenQual]]] = None
    args: List[str] = []

# TODO: tokenize into data stream with useful arguments
# TODO: data stream -> tree structure
def find_tok(l: str, hdr: bool) -> Optional[PerTokenName]:
    ll = l.lower() if hdr else l
    for h in (PER_TOKEN_HEADER if hdr else PER_TOKEN_BODY):
        if hdr:
            # header tokens are matched case-insensitively by prefix
            if ll.startswith(h.value.lower()):
                return h
        # body tokens must match the first word exactly, otherwise e.g.
        # "line.long ..." would match LINE by prefix and lose its qual
        elif ll.split()[0] == h.value:
            return h
    # unknown header tokens are fatal; body callers handle None themselves
    assert not hdr, "Unknown token on line: %s"%l
    return None
def tokenize_body(f, l=None):
    prevtell = -1
    while True:
        if l is None: l = f.readline().strip()
        if len(l) == 0:
            tell = f.tell()
            if tell == prevtell: break # EOF
            prevtell = tell
            continue
        ll = l.lower()
        sp = l.split()
        # regular token without a size qualifier
        t = find_tok(ll, False)
        if t is not None:
            yield PerToken(t, args=sp[1:])
            l = None
            continue
        # token with one or two size qualifiers, e.g. "line.long"
        ht = sp[0].lower().split('.')
        assert len(ht) > 1 or sp[0].lower() in PER_TOKEN_IGNORE, "Unknown token on line: %s"%l
        t = find_tok(ht[0], False)
        if t is not None:
            assert len(ht) in {2,3}, "bad qual %s in line %s"%(repr(ht),l)
            # look quals up by value ("long"), not by member name ("LONG");
            # bare hex quals (see PerTokenQual) are not handled yet
            quals = [PerTokenQual(k) for k in ht[1:]]
            if len(quals) == 1: quals = quals[0]
            else: quals = tuple(quals)
            yield PerToken(t, qual=quals, args=sp[1:])
            l = None
            continue
        l = None # ignorable token, skip the line
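# e.g. (hypothetical input, per the format comments above) the body line
#   line.long 0x10 "CTRL,control register"
# yields PerToken(name=PerTokenName.LINE, qual=PerTokenQual.LONG,
#   args=['0x10', '"CTRL,control', 'register"'])
# (args is a naive whitespace split; cf. the "useful arguments" TODO above)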
def tokenize(f):
    curtok = None
    curlines = []
    prevtell = -1
    while True:
        l = f.readline().strip()
        if len(l) == 0:
            tell = f.tell()
            if tell == prevtell: break # EOF
            prevtell = tell
            continue
        if l[0] != ';':
            # end of header: flush the pending header token, then hand this
            # line and the rest of the file to the body tokenizer
            if curtok is not None:
                yield PerToken(curtok, args=curlines)
            yield from tokenize_body(f, l)
            break
        if l.startswith('; -------'): continue # header separator
        if l.startswith('; @') and l[3:4] != ' ':
            # new token! flush old one
            if curtok is not None:
                yield PerToken(curtok, args=curlines)
            # start new
            curtok = find_tok(l, True)
            curlines = [l[len(curtok.value)+1:].strip()]
        else:
            # continuation line of the current header token
            curlines.append(l[3:].strip())
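# e.g. (hypothetical input) a header line "; @Title STM32F0 peripherals" is
# flushed as PerToken(name=PerTokenName.Title, args=['STM32F0 peripherals'])
# once the next header token (or the body) begins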
# flatten copy and repeat statements
def tokenize_flatten(itor):
    yield from itor # TODO
if __name__ == '__main__':
    import glob
    # smoke-test: tokenize every .per file in the test tree
    for p in glob.glob('t/*/*.per'):
        print(p)
        with open(p,'r') as f:
            for x in tokenize(f): pass
    # dump the token streams of two sample files
    with open('t/arm/peram65xx.per','r') as f:
        for x in tokenize(f): print(x)
    with open('t/arm/perfm0p.per','r') as f:
        for x in tokenize(f): print(x)