utop/src/lib/uTop_lexer.mll

(*
 * uTop_lexer.mll
 * --------------
 * Copyright : (c) 2011, Jeremie Dimino <jeremie@dimino.org>
 * Licence   : BSD3
 *
 * This file is a part of utop.
 *)

(* Lexer for the OCaml language. *)

{
  open Lexing
  open UTop_token

  let mkloc idx1 idx2 ofs1 ofs2 = {
    idx1 = idx1;
    idx2 = idx2;
    ofs1 = ofs1;
    ofs2 = ofs2;
  }

  (* Only for ascii-only lexemes. *)
  let lexeme_loc idx lexbuf =
    let ofs1 = lexeme_start lexbuf and ofs2 = lexeme_end lexbuf in
    {
      idx1 = idx;
      idx2 = idx + (ofs2 - ofs1);
      ofs1 = ofs1;
      ofs2 = ofs2;
    }

  let _merge_loc l1 l2 = {
    idx1 = l1.idx1;
    idx2 = l2.idx2;
    ofs1 = l1.ofs1;
    ofs2 = l2.ofs2;
  }

}

let uchar = ['\x00' - '\x7f'] | _ [ '\x80' - '\xbf' ]*

let blank = [' ' '\009' '\012']
let lowercase = ['a'-'z' '_']
let uppercase = ['A'-'Z']
let identchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9']
let lident = lowercase identchar*
let uident = uppercase identchar*
let ident = (lowercase|uppercase) identchar*

let hexa_char = ['0'-'9' 'A'-'F' 'a'-'f']
let decimal_literal =
  ['0'-'9'] ['0'-'9' '_']*
let hex_literal =
  '0' ['x' 'X'] hexa_char ['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal =
  '0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
let bin_literal =
  '0' ['b' 'B'] ['0'-'1'] ['0'-'1' '_']*
let int_literal =
  decimal_literal | hex_literal | oct_literal | bin_literal
let float_literal =
  ['0'-'9'] ['0'-'9' '_']*
  ('.' ['0'-'9' '_']* )?
  (['e' 'E'] ['+' '-']? ['0'-'9'] ['0'-'9' '_']*)?

let symbolchar =
  ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']

rule tokens idx acc = parse
  | eof
      { (idx, None, List.rev acc) }
  | ('\n' | blank)+
      { let loc = lexeme_loc idx lexbuf in
        tokens loc.idx2 ((Blanks, loc) :: acc) lexbuf }
  | lident
      { let src = lexeme lexbuf in
        let loc = lexeme_loc idx lexbuf in
        let tok =
          match src with
            | ("true" | "false") ->
                Constant src
            | _ ->
                Lident src
        in
        tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
  | uident
      { let src = lexeme lexbuf in
        let loc = lexeme_loc idx lexbuf in
        let tok = Uident src in
        tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
  | int_literal "l"
  | int_literal "L"
  | int_literal "n"
  | int_literal
  | float_literal
      { let loc = lexeme_loc idx lexbuf in
        let tok = Constant (lexeme lexbuf) in
        tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
  | '"'
      { let ofs = lexeme_start lexbuf in
        let item, idx2= cm_string (idx + 1) lexbuf in
        let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
        tokens idx2 ((item, loc) :: acc) lexbuf }
  | '{' (lowercase* as tag) '|'
      { let ofs = lexeme_start lexbuf in
        let delim_len = String.length tag + 2 in
        let idx2, terminated = quoted_string (idx + delim_len) tag false lexbuf in
        let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
        tokens idx2 ((String (delim_len, terminated), loc) :: acc) lexbuf }
  | "'" [^'\'' '\\'] "'"
  | "'\\" ['\\' '"' 'n' 't' 'b' 'r' ' ' '\'' 'x' '0'-'9'] eof
  | "'\\" ['\\' '"' 'n' 't' 'b' 'r' ' ' '\''] "'"
  | "'\\" (['0'-'9'] ['0'-'9'] | 'x' hexa_char) eof
  | "'\\" (['0'-'9'] ['0'-'9'] ['0'-'9'] | 'x' hexa_char hexa_char) eof
  | "'\\" (['0'-'9'] ['0'-'9'] ['0'-'9'] | 'x' hexa_char hexa_char) "'"
      { let loc = lexeme_loc idx lexbuf in
        tokens loc.idx2 ((Char, loc) :: acc) lexbuf }
  | "'\\" uchar
      { let loc = mkloc idx (idx + 3) (lexeme_start lexbuf) (lexeme_end lexbuf) in
        tokens loc.idx2 ((Error, loc) :: acc) lexbuf }
  | "(*)"
      { let loc = lexeme_loc idx lexbuf in
        tokens loc.idx2 ((Comment (Comment_reg, true), loc) :: acc) lexbuf }
  | "(**)"
      { let loc = lexeme_loc idx lexbuf in
        tokens loc.idx2 ((Comment (Comment_doc, true), loc) :: acc) lexbuf }
  | "(**"
      { let ofs = lexeme_start lexbuf in
        let idx2, terminated = comment (idx + 3) 0 false lexbuf in
        let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
        tokens idx2 ((Comment (Comment_doc, terminated), loc) :: acc) lexbuf }
  | "(*"
      { let ofs = lexeme_start lexbuf in
        let idx2, terminated = comment (idx + 2) 0 false lexbuf in
        let loc = mkloc idx idx2 ofs (lexeme_end lexbuf) in
        tokens idx2 ((Comment (Comment_reg, terminated), loc) :: acc) lexbuf }
  | ""
      { symbol idx acc lexbuf }

and symbol idx acc = parse
  | "(" | ")"
  | "[" | "]"
  | "{" | "}"
  | "`"
  | "#"
  | ","
  | ";" | ";;"
  | symbolchar+
      { let loc = lexeme_loc idx lexbuf in
        let tok = Symbol (lexeme lexbuf) in
        tokens loc.idx2 ((tok, loc) :: acc) lexbuf }
  | uchar as uchar
      { let uChar= Zed_utf8.unsafe_extract uchar 0 in
        if Zed_char.is_combining_mark uChar then
          let tok, loc= List.hd acc
          and tl= List.tl acc in
          let tok= match tok with
            | Symbol str-> Symbol (str ^ (lexeme lexbuf))
            | Lident str-> Lident (str ^ (lexeme lexbuf))
            | Uident str-> Uident (str ^ (lexeme lexbuf))
            | Constant str-> Constant (str ^ (lexeme lexbuf))
            | _-> tok
          in
          let loc= { loc with ofs2= lexeme_end lexbuf } in
          tokens loc.idx2 ((tok, loc) :: tl) lexbuf
        else
          let loc = mkloc idx (idx + 1) (lexeme_start lexbuf) (lexeme_end lexbuf) in
          tokens loc.idx2 ((Error, loc) :: acc) lexbuf
      }

and cm_string idx= parse
  | '"'
      { (String (1, true), idx+1) }
  | "\\\""
      { let idx2, terminated= string (idx + 2) false lexbuf in
        (String (1, terminated), idx2)
      }
  | uchar as uchar
      {
        let uChar= Zed_utf8.unsafe_extract uchar 0 in
        if Zed_char.is_combining_mark uChar then
          cm_string idx lexbuf
        else
          let idx2, terminated= string (idx + 1) true lexbuf in
          (String (1, terminated), idx2)
      }
  | eof
      { (String (1, false), idx) }

and comment idx depth combining= parse
  | "(*"
      { comment (idx + 2) (depth + 1) false lexbuf }
  | "*)"
      { if depth = 0 then
          (idx + 2, true)
        else
          comment (idx + 2) (depth - 1) false lexbuf }
  | '"'
      { let idx, terminated = string (idx + 1) false lexbuf in
        if terminated then
          comment idx depth false lexbuf
        else
          (idx, false) }
  | uchar as uchar
      { let uChar= Zed_utf8.unsafe_extract uchar 0 in
        if not combining then
          if Zed_char.is_combining_mark uChar then
            comment (idx + 1) depth false lexbuf
          else
            comment (idx + 1) depth true lexbuf
        else
          if Zed_char.is_combining_mark uChar then
            comment idx depth true lexbuf
          else
            comment (idx + 1) depth true lexbuf
      }
  | eof
      { (idx, false) }

and string idx combining= parse
  | '"'
      { (idx + 1, true) }
  | "\\\""
      { string (idx + 2) false lexbuf }
  | uchar as uchar
      { let uChar= Zed_utf8.unsafe_extract uchar 0 in
        if not combining then
          if Zed_char.is_combining_mark uChar then
            string (idx + 1) false lexbuf
          else
            string (idx + 1) true lexbuf
        else
          if Zed_char.is_combining_mark uChar then
            string idx true lexbuf
          else
            string (idx + 1) true lexbuf
      }
  | eof
      { (idx, false) }

and quoted_string idx tag combining= parse
    | '|' (lowercase* as tag2) '}'
        { let idx = idx + 2 + String.length tag2 in
          if tag = tag2 then
            (idx, true)
          else
            quoted_string idx tag false lexbuf }
    | eof
        { (idx, false) }
    | uchar as uchar
        { let uChar= Zed_utf8.unsafe_extract uchar 0 in
          if not combining then
            if Zed_char.is_combining_mark uChar then
              quoted_string (idx + 1) tag false lexbuf
            else
              quoted_string (idx + 1) tag true lexbuf
          else
            if Zed_char.is_combining_mark uChar then
              quoted_string idx tag true lexbuf
            else
              quoted_string (idx + 1) tag true lexbuf
        }

{
  let lex_string str =
    let _, _, items = tokens 0 [] (Lexing.from_string str) in
    items
}