utop/src/lib/uTop_lexer.mll

271 lines
8.1 KiB
OCaml

(*
* uTop_lexer.mll
* --------------
* Copyright : (c) 2011, Jeremie Dimino <jeremie@dimino.org>
* Licence : BSD3
*
* This file is a part of utop.
*)
(* Lexer for the OCaml language. *)
{
open Lexing
open UTop_token
let mkloc idx1 idx2 ofs1 ofs2 = {
idx1 = idx1;
idx2 = idx2;
ofs1 = ofs1;
ofs2 = ofs2;
}
(* Only for ascii-only lexemes. *)
let lexeme_loc idx lexbuf =
let ofs1 = lexeme_start lexbuf and ofs2 = lexeme_end lexbuf in
{
idx1 = idx;
idx2 = idx + (ofs2 - ofs1);
ofs1 = ofs1;
ofs2 = ofs2;
}
let _merge_loc l1 l2 = {
idx1 = l1.idx1;
idx2 = l2.idx2;
ofs1 = l1.ofs1;
ofs2 = l2.ofs2;
}
}
let uchar = ['\x00' - '\x7f'] | _ [ '\x80' - '\xbf' ]*
let blank = [' ' '\009' '\012']
let lowercase = ['a'-'z' '_']
let uppercase = ['A'-'Z']
let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
let lident = lowercase identchar*
let uident = uppercase identchar*
let ident = (lowercase|uppercase) identchar*
let hexa_char = ['0'-'9' 'A'-'F' 'a'-'f']
let decimal_literal =
['0'-'9'] ['0'-'9' '_']*
let hex_literal =
'0' ['x' 'X'] hexa_char ['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal =
'0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
let bin_literal =
'0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
let int_literal =
decimal_literal | hex_literal | oct_literal | bin_literal
let float_literal =
['0'-'9'] ['0'-'9' '_']*
('.' ['0'-'9' '_']* )?
(['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?
let symbolchar =
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
rule tokens idx acc = parse
| eof
{ (idx, None, List.rev acc) }
| ('\n' | blank)+
{ let loc = lexeme_loc idx lexbuf in
tokens loc.idx2 ((Blanks, loc) :: acc) lexbuf }
| lident
{ let src = lexeme lexbuf in
let loc = lexeme_loc idx lexbuf in
let tok =
match src with
| ("true" | "false") ->
Constant src
| _ ->
Lident src
in
tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
| uident
{ let src = lexeme lexbuf in
let loc = lexeme_loc idx lexbuf in
let tok = Uident src in
tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
| int_literal "l"
| int_literal "L"
| int_literal "n"
| int_literal
| float_literal
{ let loc = lexeme_loc idx lexbuf in
let tok = Constant (lexeme lexbuf) in
tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
| '"'
{ let ofs = lexeme_start lexbuf in
let item, idx2= cm_string (idx + 1) lexbuf in
let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
tokens idx2 ((item, loc) :: acc) lexbuf }
| '{' (lowercase* as tag) '|'
{ let ofs = lexeme_start lexbuf in
let delim_len = String.length tag + 2 in
let idx2, terminated = quoted_string (idx + delim_len) tag false lexbuf in
let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
tokens idx2 ((String (delim_len, terminated), loc) :: acc) lexbuf }
| "'" [^'\'' '\\'] "'"
| "'\\" ['\\' '"' 'n' 't' 'b' 'r' ' ' '\'' 'x' '0'-'9'] eof
| "'\\" ['\\' '"' 'n' 't' 'b' 'r' ' ' '\''] "'"
| "'\\" (['0'-'9'] ['0'-'9'] | 'x' hexa_char) eof
| "'\\" (['0'-'9'] ['0'-'9'] ['0'-'9'] | 'x' hexa_char hexa_char) eof
| "'\\" (['0'-'9'] ['0'-'9'] ['0'-'9'] | 'x' hexa_char hexa_char) "'"
{ let loc = lexeme_loc idx lexbuf in
tokens loc.idx2 ((Char, loc) :: acc) lexbuf }
| "'\\" uchar
{ let loc = mkloc idx (idx + 3) (lexeme_start lexbuf) (lexeme_end lexbuf) in
tokens loc.idx2 ((Error, loc) :: acc) lexbuf }
| "(*)"
{ let loc = lexeme_loc idx lexbuf in
tokens loc.idx2 ((Comment (Comment_reg, true), loc) :: acc) lexbuf }
| "(**)"
{ let loc = lexeme_loc idx lexbuf in
tokens loc.idx2 ((Comment (Comment_doc, true), loc) :: acc) lexbuf }
| "(**"
{ let ofs = lexeme_start lexbuf in
let idx2, terminated = comment (idx + 3) 0 false lexbuf in
let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
tokens idx2 ((Comment (Comment_doc, terminated), loc) :: acc) lexbuf }
| "(*"
{ let ofs = lexeme_start lexbuf in
let idx2, terminated = comment (idx + 2) 0 false lexbuf in
let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
tokens idx2 ((Comment (Comment_reg, terminated), loc) :: acc) lexbuf }
| ""
{ symbol idx acc lexbuf }
and symbol idx acc = parse
| "(" | ")"
| "[" | "]"
| "{" | "}"
| "`"
| "#"
| ","
| ";" | ";;"
| symbolchar+
{ let loc = lexeme_loc idx lexbuf in
let tok = Symbol (lexeme lexbuf) in
tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
| uchar as uchar
{ let uChar= Zed_utf8.unsafe_extract uchar 0 in
if Zed_char.is_combining_mark uChar then
let tok, loc= List.hd acc
and tl= List.tl acc in
let tok= match tok with
| Symbol str-> Symbol (str ^ (lexeme lexbuf))
| Lident str-> Lident (str ^ (lexeme lexbuf))
| Uident str-> Uident (str ^ (lexeme lexbuf))
| Constant str-> Constant (str ^ (lexeme lexbuf))
| _-> tok
in
let loc= { loc with ofs2= lexeme_end lexbuf } in
tokens loc.idx2 ((tok, loc) :: tl) lexbuf
else
let loc = mkloc idx (idx + 1) (lexeme_start lexbuf) (lexeme_end lexbuf) in
tokens loc.idx2 ((Error, loc) :: acc) lexbuf
}
and cm_string idx= parse
| '"'
{ (String (1, true), idx+1) }
| "\\\""
{ let idx2, terminated= string (idx + 2) false lexbuf in
(String (1, terminated), idx2)
}
| uchar as uchar
{
let uChar= Zed_utf8.unsafe_extract uchar 0 in
if Zed_char.is_combining_mark uChar then
cm_string idx lexbuf
else
let idx2, terminated= string (idx + 1) true lexbuf in
(String (1, terminated), idx2)
}
| eof
{ (String (1, false), idx) }
and comment idx depth combining= parse
| "(*"
{ comment (idx + 2) (depth + 1) false lexbuf }
| "*)"
{ if depth = 0 then
(idx + 2, true)
else
comment (idx + 2) (depth - 1) false lexbuf }
| '"'
{ let idx, terminated = string (idx + 1) false lexbuf in
if terminated then
comment idx depth false lexbuf
else
(idx, false) }
| uchar as uchar
{ let uChar= Zed_utf8.unsafe_extract uchar 0 in
if not combining then
if Zed_char.is_combining_mark uChar then
comment (idx + 1) depth false lexbuf
else
comment (idx + 1) depth true lexbuf
else
if Zed_char.is_combining_mark uChar then
comment idx depth true lexbuf
else
comment (idx + 1) depth true lexbuf
}
| eof
{ (idx, false) }
and string idx combining= parse
| '"'
{ (idx + 1, true) }
| "\\\""
{ string (idx + 2) false lexbuf }
| uchar as uchar
{ let uChar= Zed_utf8.unsafe_extract uchar 0 in
if not combining then
if Zed_char.is_combining_mark uChar then
string (idx + 1) false lexbuf
else
string (idx + 1) true lexbuf
else
if Zed_char.is_combining_mark uChar then
string idx true lexbuf
else
string (idx + 1) true lexbuf
}
| eof
{ (idx, false) }
and quoted_string idx tag combining= parse
| '|' (lowercase* as tag2) '}'
{ let idx = idx + 2 + String.length tag2 in
if tag = tag2 then
(idx, true)
else
quoted_string idx tag false lexbuf }
| eof
{ (idx, false) }
| uchar as uchar
{ let uChar= Zed_utf8.unsafe_extract uchar 0 in
if not combining then
if Zed_char.is_combining_mark uChar then
quoted_string (idx + 1) tag false lexbuf
else
quoted_string (idx + 1) tag true lexbuf
else
if Zed_char.is_combining_mark uChar then
quoted_string idx tag true lexbuf
else
quoted_string (idx + 1) tag true lexbuf
}
{
let lex_string str =
let _, _, items = tokens 0 [] (Lexing.from_string str) in
items
}