PER: tokenizer

Triss 2022-06-02 03:37:38 +02:00
parent 53d3501604
commit 07ae409fdf
1 changed file with 218 additions and 0 deletions

t32per/pertoken.py Executable file

@@ -0,0 +1,218 @@
#!/usr/bin/env python3
from enum import Enum
from typing import List, NamedTuple, Optional, Tuple, Union
class PerTokenName(Enum):
    Title = "; @Title"
    Props = "; @Props"
    Author = "; @Author"
    Changelog = "; @Changelog"
    Manufacturer = "; @Manufacturer"
    Doc = "; @Doc"
    Core = "; @Core"
    Chip = "; @Chip"
    Chiplist = "; @Chiplist"
    Copyright = "; @Copyright"
    Description = "; @Description"
    Keywords = "; @Keywords"
    Date = "; @Date"
    #HeaderSep = "; -------"
    Id = "; @Id"
    CONFIG = "config" # what is this
    BASE = "base" # base address, base addrspace:expr
    # expr can be int or (d.l(addrspace:off)) -> ???
    WIDTH = "width" # what is this
    SAVEINDEX = "saveindex" # what is this
    TREE_OPEN = "tree.open"
    TREE_CLOSE = "tree.close" # like tree.open
    TREE_END = "tree.end"
    TREE = "tree"
    # group of stuff (sometimes one reg?)
    # group (addrspace:)start--end
    # addrspaces: c15 c14 ad(=what?) <none> EE(avr eeprom) D(avr data?) d(msp430)
    #   CSR(riscv) NAR(xtensa shit), "e:comp.base('name',-1)"(also xtensa) SPR(xtensa)
    # also seen: group iospace() (teaklite)
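    # illustrative guess at the syntax above (not from a real .per file):
    #   group D:0x0020--0x005F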
    GROUP = "group"
    HGROUP = "hgroup"
    RGROUP = "rgroup"
    SGROUP = "sgroup"
    WGROUP = "wgroup"
    # a register (sometimes hidden)
    # line.qual offset "CODENAME,description"
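    # illustrative guess (not from a real .per file):
    #   line.long 0x10 "CTRL,control register"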
    HIDE = "hide"
    LINE = "line"
    # bitfields and stuff in registers
    # bitfld.qual IDK start(--end) "CODE,description" ("if 0","if 1",...)
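    # illustrative guess (not from a real .per file; <IDK> field unknown):
    #   bitfld.long <IDK> 0--1 "MODE,mode select" ("off","on","auto","test")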
    BITFLD = "bitfld"
    ABITFLD = "abitfld"
    RBITFLD = "rbitfld"
    HEXFLD = "hexfld"
    #RHEXFLD? WHEXFLD??
    EVENTFLD = "eventfld"
    SETCLRFLD = "setclrfld"
    # masks in registers?
    # hexmask.qual IDK start--end MASK "CODE,description"
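    # illustrative guess (not from a real .per file; <IDK> field unknown):
    #   hexmask.long <IDK> 0--7 0xFF "ID,device id"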
    DECMASK = "decmask"
    #RDECMASK? WDECMASK??
    HEXMASK = "hexmask"
    RHEXMASK = "rhexmask"
    #WHEXMASK??
    COPY = "copy"
    REPEAT_END = "repeat.end"
    REPEAT_REPLAY = "repeat.replay"
    REPEAT = "repeat"

# to ignore: assert autoindent.{on,off} button elif else endif if in
#   newline sif textline textfld x
# IDK: entry, read, wait, saveindex, saveout, set, getx, register
# EH: include, endian.{be,le} (-> conditional endianness)
# TODO: copy: copy from previous group? (->derivedFrom whee)
# TODO: repeat{,.end}: repeat macro stuff (cf. dimIndexElement)
# TODO: repeat.replay: copy+replay
PER_TOKEN_IGNORE = {
    'assert','autoindent.on','autoindent.off','button','elif','else','endif',
    'if','in','newline','sif','textline','textfld','x',
    'entry','read','wait','saveindex','saveout','set','getx','register',
    'include','endian.be','endian.le'
}
PER_TOKEN_HEADER = [PerTokenName.__members__[x] for x in (
    'Title','Props','Author','Changelog','Manufacturer','Doc','Core','Chiplist',
    'Copyright','Id','Chip','Description','Keywords','Date',#'HeaderSep',
)]
PER_TOKEN_BODY = [v for k, v in PerTokenName.__members__.items() if v not in PER_TOKEN_HEADER]
class PerTokenQual(Enum):
    # also .<hex>?
    BYTE = "byte"
    WORD = "word"
    LONG = "long"
    QUAD = "quad"
    SHORT = "short"
    SBYTE = "sbyte"
    TBYTE = "tbyte"
    # used for mask stuff
    """
    LONG_TBYTE = "long.tbyte"
    LONG_BYTE = "long.byte"
    LONG_WORD = "long.word"
    LONG_LONG = "long.long"
    WORD_BYTE = "word.byte"
    WORD_WORD = "word.word"
    BYTE_BYTE = "byte.byte"
    QUAD_BYTE = "quad.byte"
    QUAD_SBYTE = "quad.sbyte"
    QUAD_TBYTE = "quad.tbyte"
    QUAD_WORD = "quad.word"
    QUAD_SHORT = "quad.short"
    QUAD_LONG = "quad.long"
    QUAD_QUAD = "quad.quad"
    TBYTE_BYTE = "tbyte.byte"
    TBYTE_WORD = "tbyte.word"
    """
class PerToken(NamedTuple):
    name: PerTokenName
    qual: Optional[Union[PerTokenQual, Tuple[PerTokenQual, PerTokenQual]]] = None
    args: List[str] = []

# TODO: tokenize into data stream with useful arguments
# TODO: data stream -> tree structure
def find_tok(l: str, hdr: bool) -> Optional[PerTokenName]:
    ll = l.lower() if hdr else l
    for h in (PER_TOKEN_HEADER if hdr else PER_TOKEN_BODY):
        if hdr:
            # header tokens are matched case-insensitively by prefix
            if ll.startswith(h.value.lower()):
                return h
        # body tokens must match the first word exactly, otherwise e.g.
        # "line.long ..." would match LINE by prefix and lose its qual
        elif ll.split()[0] == h.value:
            return h
    # unknown header tokens are fatal; body callers handle None themselves
    assert not hdr, "Unknown token on line: %s"%l
    return None
def tokenize_body(f, l=None):
    prevtell = -1
    while True:
        if l is None: l = f.readline().strip()
        if len(l) == 0:
            tell = f.tell()
            if tell == prevtell: break # EOF
            prevtell = tell
            continue
        ll = l.lower()
        sp = l.split()
        # regular token without a size qualifier
        t = find_tok(ll, False)
        if t is not None:
            yield PerToken(t, args=sp[1:])
            l = None
            continue
        # token with one or two size qualifiers, e.g. "line.long"
        ht = sp[0].lower().split('.')
        assert len(ht) > 1 or sp[0].lower() in PER_TOKEN_IGNORE, "Unknown token on line: %s"%l
        t = find_tok(ht[0], False)
        if t is not None:
            assert len(ht) in {2,3}, "bad qual %s in line %s"%(repr(ht),l)
            # look quals up by value ("long"), not by member name ("LONG");
            # bare hex quals (see PerTokenQual) are not handled yet
            quals = [PerTokenQual(k) for k in ht[1:]]
            if len(quals) == 1: quals = quals[0]
            else: quals = tuple(quals)
            yield PerToken(t, qual=quals, args=sp[1:])
            l = None
            continue
        l = None # ignorable token, skip the line
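# e.g. (hypothetical input, per the format comments above) the body line
#   line.long 0x10 "CTRL,control register"
# yields PerToken(name=PerTokenName.LINE, qual=PerTokenQual.LONG,
#   args=['0x10', '"CTRL,control', 'register"'])
# (args is a naive whitespace split; cf. the "useful arguments" TODO above)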
def tokenize(f):
    curtok = None
    curlines = []
    prevtell = -1
    while True:
        l = f.readline().strip()
        if len(l) == 0:
            tell = f.tell()
            if tell == prevtell: break # EOF
            prevtell = tell
            continue
        if l[0] != ';':
            # end of header: flush the pending header token, then hand this
            # line and the rest of the file to the body tokenizer
            if curtok is not None:
                yield PerToken(curtok, args=curlines)
            yield from tokenize_body(f, l)
            break
        if l.startswith('; -------'): continue # header separator
        if l.startswith('; @') and l[3:4] != ' ':
            # new token! flush old one
            if curtok is not None:
                yield PerToken(curtok, args=curlines)
            # start new
            curtok = find_tok(l, True)
            curlines = [l[len(curtok.value)+1:].strip()]
        else:
            # continuation line of the current header token
            curlines.append(l[3:].strip())
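# e.g. (hypothetical input) a header line "; @Title STM32F0 peripherals" is
# flushed as PerToken(name=PerTokenName.Title, args=['STM32F0 peripherals'])
# once the next header token (or the body) begins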
# flatten copy and repeat statements
def tokenize_flatten(itor):
    yield from itor # TODO
if __name__ == '__main__':
    import glob
    # smoke-test: tokenize every .per file in the test tree
    for p in glob.glob('t/*/*.per'):
        print(p)
        with open(p,'r') as f:
            for x in tokenize(f): pass
    # dump the token streams of two sample files
    with open('t/arm/peram65xx.per','r') as f:
        for x in tokenize(f): print(x)
    with open('t/arm/perfm0p.per','r') as f:
        for x in tokenize(f): print(x)