From e727a180efec841e6ac9dc57fe75d60d76e26059 Mon Sep 17 00:00:00 2001 From: xenia Date: Thu, 25 Apr 2024 02:07:52 -0400 Subject: [PATCH] refactor, add bytes --- ppx/ppx_unicode.ml | 86 +++++++++++++++++++++------------------- test/test_ppx_unicode.ml | 6 ++- 2 files changed, 51 insertions(+), 41 deletions(-) diff --git a/ppx/ppx_unicode.ml b/ppx/ppx_unicode.ml index 8fb0c0d..93cc739 100644 --- a/ppx/ppx_unicode.ml +++ b/ppx/ppx_unicode.ml @@ -1,61 +1,67 @@ open Ppxlib -let has_attr name attrs = - List.find_opt (fun attr -> attr.attr_name.txt = name) attrs - |> Option.is_some +let attr_filter attr = + let n = attr.attr_name.txt in + n = "uchar" || n = "unicode.uchar" || n = "bytes" || n = "unicode.bytes" + || n = "utf8" || n = "unicode.utf8" + +let attr_uchar_filter attr = + let n = attr.attr_name.txt in + n = "uchar" || n = "unicode.uchar" + +let attr_bytes_filter attr = + let n = attr.attr_name.txt in + n = "bytes" || n = "unicode.bytes" let decode_single_uchar_opt str = if String.is_valid_utf_8 str then let res = String.get_utf_8_uchar str 0 in if (Uchar.utf_decode_is_valid res) && (Uchar.utf_decode_length res == String.length str) then - Some (Uchar.utf_decode_uchar res |> Uchar.to_int) + Some (Uchar.utf_decode_uchar res |> Uchar.to_int |> Int.to_string) else None else None -let nfc_mapper = object +let utf8_attr_mapper = object inherit Ast_traverse.map as super method! expression = fun { pexp_desc; pexp_loc; pexp_loc_stack; pexp_attributes } -> - if has_attr "uchar" pexp_attributes then - let pexp_attributes = - List.filter (fun attr -> attr.attr_name.txt <> "uchar") pexp_attributes in - match pexp_desc with - | Pexp_constant (Pconst_string (value, loc, _)) -> - begin match decode_single_uchar_opt value with - | Some cp -> - let cp = Pconst_integer (Int.to_string cp, None) in - let cp = { - pexp_desc = Pexp_constant cp; - pexp_loc; pexp_loc_stack; pexp_attributes - } in - [%expr Uchar.of_int [%e cp]] + match pexp_desc with + | Pexp_constant (Pconst_string (value, str_loc, delim)) -> + begin + let loc = pexp_loc in + let attrs = pexp_attributes in + let pexp_attributes = List.filter (fun attr -> not @@ attr_filter attr) attrs in + let make_expr pexp_desc = + { pexp_desc; pexp_loc; pexp_loc_stack; pexp_attributes } in + let make_err msg = + make_expr @@ Pexp_extension (Location.error_extensionf ~loc msg) in + if List.find_opt attr_uchar_filter attrs |> Option.is_some then + match decode_single_uchar_opt value with + | Some value -> + let value = Pexp_constant (Pconst_integer (value, None)) in + let value = { pexp_desc = value; pexp_loc; pexp_loc_stack; pexp_attributes = [] } in + { [%expr Uchar.of_int [%e value]] with pexp_attributes } | None -> - let err_ext = - Pexp_extension begin Location.error_extensionf ~loc:loc - "[@uchar] string must consist of a single valid utf-8 codepoint" end - in { pexp_desc = err_ext; pexp_loc; pexp_loc_stack; pexp_attributes } - end - | _ -> - let err_ext = - Pexp_extension begin Location.error_extensionf ~loc:pexp_loc - "[@uchar] attribute must be on a string constant" end - in { pexp_desc = err_ext; pexp_loc; pexp_loc_stack; pexp_attributes } - else - super#expression { pexp_desc; pexp_loc; pexp_loc_stack; pexp_attributes } - - method! constant = function - | Pconst_string (value, loc, delim) -> - if String.is_valid_utf_8 value then - Pconst_string ((Uunf_string.normalize_utf_8 `NFC value), loc, delim) - else - Location.raise_errorf ~loc "invalid utf-8 in string literal" - | const -> super#constant const + make_err "[@uchar] must be a single utf-8 codepoint literal" + else if List.find_opt attr_bytes_filter attrs |> Option.is_some then + let value_expr = Pexp_constant (Pconst_string (value, str_loc, delim)) in + let value_expr = + { pexp_desc = value_expr; pexp_loc; pexp_loc_stack; pexp_attributes = [] } in + { [%expr Bytes.of_string [%e value_expr]] with pexp_attributes } + else if String.is_valid_utf_8 value then + let value = Uunf_string.normalize_utf_8 `NFC value in + make_expr @@ Pexp_constant (Pconst_string (value, str_loc, delim)) + else + make_err "string literal is not valid utf-8" + end + | _ -> super#expression { pexp_desc; pexp_loc; pexp_loc_stack; pexp_attributes } end + let () = Driver.register_transformation - ~impl:nfc_mapper#structure - "ppx_unicode" + ~impl:utf8_attr_mapper#structure + "unicode" diff --git a/test/test_ppx_unicode.ml b/test/test_ppx_unicode.ml index 923edc6..7e1a16f 100644 --- a/test/test_ppx_unicode.ml +++ b/test/test_ppx_unicode.ml @@ -1,3 +1,7 @@ let () = let a = "å"[@uchar] in - ignore a + let b = Buffer.create 16 in + Buffer.add_utf_8_uchar b a; + Buffer.add_utf_8_uchar b a; + let str = Buffer.to_bytes b |> String.of_bytes in + Printf.printf "meow meow: %s\n%!" str