Files
linguist/samples/OCaml/uutf.ml
2015-03-18 09:58:14 +00:00

811 lines
33 KiB
OCaml

(*---------------------------------------------------------------------------
Copyright 2012 Daniel C. Bünzli. All rights reserved.
Distributed under the BSD3 license, see license at the end of the file.
%%NAME%% release %%VERSION%%
---------------------------------------------------------------------------*)
(* Size, in bytes, of the chunks allocated for channel sources and for
   buffer and channel destinations. *)
let io_buffer_size = 65536 (* IO_BUFFER_SIZE 4.0.0 *)

(* Short alias for [Format.fprintf], used by the pretty-printers. *)
let pp = Format.fprintf

(* [invalid_encode ()] raises [Invalid_argument]. Used when an encoder
   that is waiting for [`Await] is handed another value. *)
let invalid_encode () = raise (Invalid_argument "expected `Await encode")

(* [invalid_bounds j l] raises [Invalid_argument] with a message that
   reports the offending index [j] and length [l]. *)
let invalid_bounds j l =
  let msg = Printf.sprintf "invalid bounds (index %d, length %d)" j l in
  raise (Invalid_argument msg)
(* Unsafe string byte manipulations. If you don't believe the author's
invariants, replacing with safe versions makes everything safe in
the module. He won't be upset. *)
(* [unsafe_chr byte] is the character with code [byte], unchecked. *)
let unsafe_chr byte = Char.unsafe_chr byte

(* [unsafe_blit src src_pos dst dst_pos len] copies [len] bytes from
   [src] at [src_pos] to [dst] at [dst_pos], without bounds checks. *)
let unsafe_blit src src_pos dst dst_pos len =
  String.unsafe_blit src src_pos dst dst_pos len

(* [unsafe_array_get a i] is element [i] of [a], without bounds checks. *)
let unsafe_array_get a i = Array.unsafe_get a i

(* [unsafe_byte s j] is the byte at index [j] of [s] as an integer,
   without bounds checks. *)
let unsafe_byte s j =
  let c = String.unsafe_get s j in
  Char.code c

(* [unsafe_set_byte s j byte] stores [byte] at index [j] of [s]; neither
   the index nor the byte value is checked. *)
let unsafe_set_byte s j byte =
  let c = Char.unsafe_chr byte in
  String.unsafe_set s j c
(* Unicode characters *)
(* Code points are represented by plain integers. *)
type uchar = int

let u_bom = 0xFEFF (* byte order mark (BOM). *)
let u_rep = 0xFFFD (* replacement character. *)

(* [is_uchar cp] is [true] iff [cp] lies in [0x0000;0x10FFFF] and outside
   the surrogate range [0xD800;0xDFFF]. *)
let is_uchar cp =
  let in_range = 0x0000 <= cp && cp <= 0x10FFFF in
  let surrogate = 0xD800 <= cp && cp <= 0xDFFF in
  in_range && not surrogate
(* [pp_cp ppf cp] prints the code point [cp] on [ppf] in U+XXXX notation:
   at least four uppercase hexadecimal digits for values up to 0xFFFF, as
   many digits as needed above that. Values outside [0;0x10FFFF] are
   printed as U+Invalid(XXXX). *)
let pp_cp ppf cp =
  if cp < 0 || cp > 0x10FFFF then Format.fprintf ppf "U+Invalid(%X)" cp
  else if cp <= 0xFFFF then Format.fprintf ppf "U+%04X" cp
  else Format.fprintf ppf "U+%X" cp

(* [cp_to_string cp] is the text printed by [pp_cp] for [cp].

   The result is built with a formatter over a private buffer rather than
   the global [Format.str_formatter]. The function therefore shares no
   mutable state between calls, and no longer returns (and discards) any
   output a caller had pending in [Format.str_formatter]. *)
let cp_to_string cp =
  let b = Buffer.create 16 in
  let ppf = Format.formatter_of_buffer b in
  pp_cp ppf cp;
  (* Flush so that everything printed reaches [b] before it is read. *)
  Format.pp_print_flush ppf ();
  Buffer.contents b
(* Unicode encoding schemes *)
type encoding = [ `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE ]
type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ]

(* [encoding_of_string s] maps the IANA character set name [s], compared
   after uppercasing, to the corresponding encoding. It is [None] when the
   name is not one of the names listed here. *)
let encoding_of_string s =
  let us_ascii_names =
    [ "ANSI_X3.4-1968"; "ISO-IR-6"; "ANSI_X3.4-1986"; "ISO_646.IRV:1991";
      "ASCII"; "ISO646-US"; "US-ASCII"; "US"; "IBM367"; "CP367"; "CSASCII" ]
  in
  let iso_8859_1_names =
    [ "ISO_8859-1:1987"; "ISO-IR-100"; "ISO_8859-1"; "ISO-8859-1";
      "LATIN1"; "L1"; "IBM819"; "CP819"; "CSISOLATIN1" ]
  in
  match String.uppercase s with
  | "UTF-8" -> Some `UTF_8
  | "UTF-16" -> Some `UTF_16
  | "UTF-16LE" -> Some `UTF_16LE
  | "UTF-16BE" -> Some `UTF_16BE
  | name when List.mem name us_ascii_names -> Some `US_ASCII
  | name when List.mem name iso_8859_1_names -> Some `ISO_8859_1
  | _ -> None

(* [encoding_to_string e] is the name used for the encoding [e]. *)
let encoding_to_string e = match e with
  | `UTF_8 -> "UTF-8"
  | `UTF_16 -> "UTF-16"
  | `UTF_16BE -> "UTF-16BE"
  | `UTF_16LE -> "UTF-16LE"
  | `US_ASCII -> "US-ASCII"
  | `ISO_8859_1 -> "ISO-8859-1"
(* Base character decoders. They assume enough data. *)
(* [malformed s j l] is a [`Malformed] value carrying the [l] bytes of [s]
   that start at index [j]. *)
let malformed s j l =
  let chunk = String.sub s j l in
  `Malformed chunk

(* [malformed_pair be hi s j l] is a [`Malformed] value made of the two
   bytes of the high surrogate [hi] (most significant byte first iff [be]
   is [true]) followed by the [l] bytes of [s] that start at [j]. Used for
   a missing or half low surrogate at end of input. *)
let malformed_pair be hi s j l =
  let hi_bytes = String.create 2 in
  let msb = hi lsr 8 and lsb = hi land 0xFF in
  if be
  then (unsafe_set_byte hi_bytes 0 msb; unsafe_set_byte hi_bytes 1 lsb)
  else (unsafe_set_byte hi_bytes 1 msb; unsafe_set_byte hi_bytes 0 lsb);
  `Malformed (hi_bytes ^ String.sub s j l)
(* [r_us_ascii s j] reads the byte at [j] in [s]: a [`Uchar] if it is at
   most 127, otherwise a one byte [`Malformed]. Assumes [j] is in bounds. *)
let r_us_ascii s j =
  (* assert (0 <= j && j < String.length s); *)
  match unsafe_byte s j with
  | b when b <= 127 -> `Uchar b
  | _ -> malformed s j 1

(* [r_iso_8859_1 s j] reads the byte at [j] in [s] and returns its value
   as a [`Uchar]. Assumes [j] is in bounds. *)
let r_iso_8859_1 s j =
  (* assert (0 <= j && j < String.length s); *)
  let b = unsafe_byte s j in
  `Uchar b
(* [utf_8_len.(b)] is the byte length of a UTF-8 sequence whose first byte
   is [b], or [0] if [b] cannot start a sequence. The table has 256
   entries: 1 for 0x00-0x7F, 0 for 0x80-0xC1, 2 for 0xC2-0xDF, 3 for
   0xE0-0xEF, 4 for 0xF0-0xF4 and 0 for 0xF5-0xFF. *)
let utf_8_len =
  let len_of_first_byte b =
    if b <= 0x7F then 1
    else if b <= 0xC1 then 0
    else if b <= 0xDF then 2
    else if b <= 0xEF then 3
    else if b <= 0xF4 then 4
    else 0
  in
  Array.init 256 len_of_first_byte
(* [r_utf_8 s j l] decodes the [l] byte long (1 to 4) UTF-8 sequence that
   starts at [j] in [s]. The result is a [`Uchar], or a [`Malformed]
   carrying the [l] bytes if a trailing byte is not a continuation byte or
   if the second byte is outside the range allowed by the first byte
   (0xE0, 0xED, 0xF0 and 0xF4 have restricted ranges). Assumes the [l]
   bytes are in bounds; any other [l] is an assertion failure. *)
let r_utf_8 s j l =
  (* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
  let byte k = unsafe_byte s (j + k) in
  let is_cont b = b lsr 6 = 0b10 in
  let bad () = malformed s j l in
  match l with
  | 1 -> `Uchar (byte 0)
  | 2 ->
      let b0 = byte 0 and b1 = byte 1 in
      if is_cont b1
      then `Uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
      else bad ()
  | 3 ->
      let b0 = byte 0 and b1 = byte 1 and b2 = byte 2 in
      (* The range allowed for the second byte depends on the first. *)
      let b1_ok = match b0 with
        | 0xE0 -> 0xA0 <= b1 && b1 <= 0xBF
        | 0xED -> 0x80 <= b1 && b1 <= 0x9F
        | _ -> is_cont b1
      in
      if is_cont b2 && b1_ok
      then `Uchar (((b0 land 0x0F) lsl 12) lor
                   ((b1 land 0x3F) lsl 6) lor
                   (b2 land 0x3F))
      else bad ()
  | 4 ->
      let b0 = byte 0 and b1 = byte 1 and b2 = byte 2 and b3 = byte 3 in
      (* The range allowed for the second byte depends on the first. *)
      let b1_ok = match b0 with
        | 0xF0 -> 0x90 <= b1 && b1 <= 0xBF
        | 0xF4 -> 0x80 <= b1 && b1 <= 0x8F
        | _ -> is_cont b1
      in
      if is_cont b3 && is_cont b2 && b1_ok
      then `Uchar (((b0 land 0x07) lsl 18) lor
                   ((b1 land 0x3F) lsl 12) lor
                   ((b2 land 0x3F) lsl 6) lor
                   (b3 land 0x3F))
      else bad ()
  | _ -> assert false
(* [r_utf_16 s j0 j1] reads a 16-bit code unit from [s], taking the most
   significant byte at [j0] and the least significant one at [j1]. The
   result is [`Hi u] for a high surrogate (the caller must then read the
   low one), a two byte [`Malformed] for a lone low surrogate and
   [`Uchar u] otherwise. Assumes both indices are in bounds. *)
let r_utf_16 s j0 j1 =
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
  let u = (unsafe_byte s j0 lsl 8) lor unsafe_byte s j1 in
  if 0xD800 <= u && u <= 0xDBFF then `Hi u
  else if 0xDC00 <= u && u <= 0xDFFF then malformed s (min j0 j1) 2
  else `Uchar u

(* [r_utf_16_lo hi s j0 j1] reads a code unit like [r_utf_16] and combines
   it with the high surrogate [hi]. If the code unit is not a low
   surrogate the result is a [`Malformed] made of the bytes of [hi]
   followed by the two bytes read; [j0 < j1] is taken to mean big
   endian when laying out the bytes of [hi]. *)
let r_utf_16_lo hi s j0 j1 =
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
  let lo = (unsafe_byte s j0 lsl 8) lor unsafe_byte s j1 in
  if 0xDC00 <= lo && lo <= 0xDFFF then
    `Uchar (0x10000 + (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)))
  else
    let big_endian = j0 < j1 in
    malformed_pair big_endian hi s (min j0 j1) 2
(* [r_encoding s j l] guesses an encoding from at most the first three of
   the [l] bytes that start at [j] in [s]. The guess is tagged with how
   decoding must start: [`BOM] if a byte order mark was recognized,
   [`ASCII p] if a UTF-16 code unit with a zero byte and a non-zero byte
   [p] was recognized, [`Decode] if the bytes still have to be decoded
   and [`End] if there are no bytes at all. Assumes the [l] bytes are in
   bounds. *)
let r_encoding s j l =
  (* assert (0 <= j && 0 <= l && j + l <= String.length s) *)
  let peek i = if i < l then Some (unsafe_byte s (j + i)) else None in
  let b0 = peek 0 and b1 = peek 1 and b2 = peek 2 in
  match b0, b1, b2 with
  | Some 0xEF, Some 0xBB, Some 0xBF -> `UTF_8 `BOM
  | Some 0xFE, Some 0xFF, _ -> `UTF_16BE `BOM
  | Some 0xFF, Some 0xFE, _ -> `UTF_16LE `BOM
  | Some 0x00, Some p, _ when p > 0 -> `UTF_16BE (`ASCII p)
  | Some p, Some 0x00, _ when p > 0 -> `UTF_16LE (`ASCII p)
  | Some u, _, _ when utf_8_len.(u) <> 0 -> `UTF_8 `Decode
  | Some _, Some _, _ -> `UTF_16BE `Decode
  | Some _, None, None -> `UTF_8 `Decode
  | None, None, None -> `UTF_8 `End
  (* A byte cannot be present when an earlier one is absent. *)
  | None, Some _, _ | Some _, None, Some _ | None, None, Some _ ->
      assert false
(* Decode *)
type src = [ `Channel of in_channel | `String of string | `Manual ]
type nln = [ `ASCII of uchar | `NLF of uchar | `Readline of uchar ]
type decode = [ `Await | `End | `Malformed of string | `Uchar of uchar]

(* [pp_decode ppf v] prints the decode result [v] on [ppf]. Characters are
   printed with [pp_cp]; the bytes of a malformed sequence are printed as
   two digit uppercase hexadecimal numbers separated by single spaces. *)
let pp_decode ppf v =
  let hex c = Char.code c in
  match v with
  | `Await -> pp ppf "`Await"
  | `End -> pp ppf "`End"
  | `Uchar u -> pp ppf "@[`Uchar %a@]" pp_cp u
  | `Malformed bs ->
      pp ppf "@[`Malformed (";
      String.iteri
        (fun i c ->
           (* No separator before the first byte. *)
           if i = 0 then pp ppf "%02X" (hex c) else pp ppf " %02X" (hex c))
        bs;
      pp ppf ")@]"
(* Decoder state. A decoder reads bytes from the chunk [i] between [i_pos]
   and [i_max]; characters that straddle two chunks are assembled in the
   small buffer [t]. Decoding is driven by the continuation [k], and each
   decoded value goes through the post-processor [pp] before it is
   returned to the client. *)
type decoder =
{ src : src; (* input source. *)
mutable encoding : decoder_encoding; (* decoded encoding. *)
nln : nln option; (* newline normalization (if any). *)
nl : int; (* newline normalization character. *)
mutable i : string; (* current input chunk. *)
mutable i_pos : int; (* input current position. *)
mutable i_max : int; (* input maximal position. *)
t : string; (* four bytes temporary buffer for overlapping reads. *)
mutable t_len : int; (* current byte length of [t]. *)
mutable t_need : int; (* number of bytes needed in [t]. *)
mutable removed_bom : bool; (* [true] if an initial BOM was removed. *)
mutable last_cr : bool; (* [true] if last char was CR. *)
mutable line : int; (* line number. *)
mutable col : int; (* column number. *)
mutable byte_count : int; (* byte count. *)
mutable count : int; (* char count. *)
mutable pp : (* decoder post-processor for BOM, position and nln. *)
decoder -> [ `Malformed of string | `Uchar of uchar ] -> decode;
mutable k : decoder -> decode } (* decoder continuation. *)
(* On decodes that overlap two (or more) [d.i] buffers, we use [t_fill] to copy
the input data to [d.t] and decode from there. If the [d.i] buffers are not
too small this is faster than continuation based byte per byte writes.
End of input (eoi) is signalled by [d.i_pos = 0] and [d.i_max = min_int]
which implies that [i_rem d < 0] is [true]. *)
(* [i_rem d] is the number of bytes that remain to be read in [d.i]. It is
   negative once end of input was set with [eoi]. *)
let i_rem d = succ (d.i_max - d.i_pos)

(* [eoi d] puts [d] in the end of input state. *)
let eoi d = d.i_max <- min_int; d.i_pos <- 0; d.i <- ""

(* [src d s j l] makes the [l] bytes of [s] that start at [j] the input
   chunk of [d]. An empty range ([l = 0]) signals end of input. Raises
   [Invalid_argument] if [j] and [l] do not designate a valid range of
   [s]. *)
let src d s j l =
  let in_bounds = 0 <= j && 0 <= l && j + l <= String.length s in
  if not in_bounds then invalid_bounds j l
  else if l = 0 then eoi d
  else begin
    d.i <- s;
    d.i_pos <- j;
    d.i_max <- j + l - 1
  end
(* [refill k d] gets new input in [d.i] and continues with [k]:
   - a manual source records [k] as the continuation and returns [`Await];
   - a string source has nothing more to give, so end of input is set;
   - a channel source reads into the chunk [d.i] and installs what was
     read with [src] (reading zero bytes thus sets end of input). *)
let refill k d = match d.src with
  | `String _ -> eoi d; k d
  | `Manual -> d.k <- k; `Await
  | `Channel ic ->
      let chunk = d.i in
      let n = input ic chunk 0 (String.length chunk) in
      src d chunk 0 n;
      k d
(* [t_need d need] empties [d.t] and records that [need] bytes are wanted
   in it. *)
let t_need d need = d.t_need <- need; d.t_len <- 0

(* [t_fill k d] moves bytes from [d.i] to [d.t] until [d.t] holds
   [d.t_need] bytes, refilling the input as needed, then continues with
   [k]. If end of input is reached first, [k] is called with fewer bytes
   in [d.t]. *)
let rec t_fill k d =
  let take n =
    unsafe_blit d.i d.i_pos d.t d.t_len (* write pos. *) n;
    d.i_pos <- d.i_pos + n;
    d.t_len <- d.t_len + n
  in
  let avail = i_rem d in
  if avail < 0 (* eoi *) then k d
  else begin
    let missing = d.t_need - d.t_len in
    if missing <= avail then (take missing; k d)
    else (take avail; refill (t_fill k) d)
  end

(* [ret k v byte_count d] records [k] as the next continuation, adds
   [byte_count] to the byte count of [d] and returns [v] post-processed by
   [d.pp]. *)
let ret k v byte_count d =
  d.byte_count <- d.byte_count + byte_count;
  d.k <- k;
  d.pp d v
(* Decoders. *)
(* US-ASCII decoder: one byte per character. Returns [`End] at end of
   input and refills when the current chunk is exhausted. *)
let rec decode_us_ascii d =
  let rem = i_rem d in
  if rem < 0 then `End
  else if rem = 0 then refill decode_us_ascii d
  else begin
    let j = d.i_pos in
    d.i_pos <- j + 1;
    ret decode_us_ascii (r_us_ascii d.i j) 1 d
  end

(* ISO-8859-1 decoder: one byte per character. Returns [`End] at end of
   input and refills when the current chunk is exhausted. *)
let rec decode_iso_8859_1 d =
  let rem = i_rem d in
  if rem < 0 then `End
  else if rem = 0 then refill decode_iso_8859_1 d
  else begin
    let j = d.i_pos in
    d.i_pos <- j + 1;
    ret decode_iso_8859_1 (r_iso_8859_1 d.i j) 1 d
  end
(* UTF-8 decoder *)
(* [t_decode_utf_8 d] decodes the sequence gathered in [d.t]. If fewer
   bytes than needed could be gathered, the bytes that are there are
   reported as malformed. *)
let rec t_decode_utf_8 d =
  let len = d.t_len in
  let v =
    if len < d.t_need then malformed d.t 0 len else r_utf_8 d.t 0 len
  in
  ret decode_utf_8 v len d

(* [decode_utf_8 d] decodes the next UTF-8 sequence. The first byte gives
   the sequence length through [utf_8_len]; a sequence that does not fit
   in the rest of the current chunk is gathered in [d.t] with [t_fill]. A
   byte that cannot start a sequence is reported as a one byte
   [`Malformed]. *)
and decode_utf_8 d =
  let rem = i_rem d in
  if rem < 0 then `End
  else if rem = 0 then refill decode_utf_8 d
  else begin
    let j = d.i_pos in
    let need = unsafe_array_get utf_8_len (unsafe_byte d.i j) in
    if rem < need then (t_need d need; t_fill t_decode_utf_8 d)
    else begin
      let len = if need = 0 then 1 else need in
      let v = if need = 0 then malformed d.i j 1 else r_utf_8 d.i j need in
      d.i_pos <- j + len;
      ret decode_utf_8 v len d
    end
  end
(* UTF-16BE decoder *)
(* [t_decode_utf_16be_lo hi d] finishes a surrogate pair whose high
   surrogate [hi] was already read and whose low surrogate was gathered in
   [d.t]. If fewer bytes than needed are in [d.t], the bytes of [hi]
   followed by the gathered bytes are reported as malformed. The byte
   count includes the two bytes of [hi]. *)
let rec t_decode_utf_16be_lo hi d = (* decode from [d.t]. *)
let bcount = d.t_len + 2 (* hi count *) in
if d.t_len < d.t_need
then ret decode_utf_16be (malformed_pair true hi d.t 0 d.t_len) bcount d
else ret decode_utf_16be (r_utf_16_lo hi d.t 0 1) bcount d
(* [t_decode_utf_16be d] decodes the code unit gathered in [d.t]; an
   incomplete code unit is reported as malformed. *)
and t_decode_utf_16be d = (* decode from [d.t]. *)
if d.t_len < d.t_need
then ret decode_utf_16be (malformed d.t 0 d.t_len) d.t_len d
else decode_utf_16be_lo (r_utf_16 d.t 0 1) d
(* [decode_utf_16be_lo v d] handles the result [v] of reading one code
   unit. A character or a malformed code unit is returned as is, counting
   two bytes. For a high surrogate the low surrogate is read next, either
   directly from [d.i] (the pair then counts four bytes) or through [d.t]
   when fewer than two bytes remain in the current chunk. *)
and decode_utf_16be_lo v d = match v with
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16be v 2 d
| `Hi hi ->
let rem = i_rem d in
if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16be_lo hi) d) else
let j = d.i_pos in
d.i_pos <- d.i_pos + 2;
ret decode_utf_16be (r_utf_16_lo hi d.i j (j + 1)) 4 d
(* [decode_utf_16be d] decodes the next UTF-16BE character. Returns [`End]
   at end of input, refills when the chunk is exhausted and goes through
   [d.t] when only one byte remains in the chunk. *)
and decode_utf_16be d =
let rem = i_rem d in
if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16be d) else
if rem < 2 then (t_need d 2; t_fill t_decode_utf_16be d) else
let j = d.i_pos in
d.i_pos <- d.i_pos + 2; decode_utf_16be_lo (r_utf_16 d.i j (j + 1)) d
(* UTF-16LE decoder, same as UTF-16BE with byte swapped. *)
(* [t_decode_utf_16le_lo hi d] finishes a surrogate pair whose high
   surrogate [hi] was already read and whose low surrogate was gathered in
   [d.t]. If fewer bytes than needed are in [d.t], the bytes of [hi]
   (least significant byte first) followed by the gathered bytes are
   reported as malformed. The byte count includes the two bytes of [hi]. *)
let rec t_decode_utf_16le_lo hi d = (* decode from [d.t]. *)
let bcount = d.t_len + 2 (* hi count *) in
if d.t_len < d.t_need
then ret decode_utf_16le (malformed_pair false hi d.t 0 d.t_len) bcount d
else ret decode_utf_16le (r_utf_16_lo hi d.t 1 0) bcount d
(* [t_decode_utf_16le d] decodes the code unit gathered in [d.t]; an
   incomplete code unit is reported as malformed. *)
and t_decode_utf_16le d = (* decode from [d.t]. *)
if d.t_len < d.t_need
then ret decode_utf_16le (malformed d.t 0 d.t_len) d.t_len d
else decode_utf_16le_lo (r_utf_16 d.t 1 0) d
(* [decode_utf_16le_lo v d] handles the result [v] of reading one code
   unit. A character or a malformed code unit is returned as is, counting
   two bytes. For a high surrogate the low surrogate is read next, either
   directly from [d.i] (the pair then counts four bytes) or through [d.t]
   when fewer than two bytes remain in the current chunk. *)
and decode_utf_16le_lo v d = match v with
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16le v 2 d
| `Hi hi ->
let rem = i_rem d in
if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16le_lo hi) d) else
let j = d.i_pos in
d.i_pos <- d.i_pos + 2;
ret decode_utf_16le (r_utf_16_lo hi d.i (j + 1) j) 4 d
(* [decode_utf_16le d] decodes the next UTF-16LE character. Returns [`End]
   at end of input, refills when the chunk is exhausted and goes through
   [d.t] when only one byte remains in the chunk. The most significant
   byte of a code unit is the second one, hence the swapped indices. *)
and decode_utf_16le d =
let rem = i_rem d in
if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16le d) else
if rem < 2 then (t_need d 2; t_fill t_decode_utf_16le d) else
let j = d.i_pos in
d.i_pos <- d.i_pos + 2; decode_utf_16le_lo (r_utf_16 d.i (j + 1) j) d
(* Encoding guessing. The guess is simple but starting the decoder
after is tedious, uutf's decoders are not designed to put bytes
back in the stream. *)
(* [guessed_utf_8 d] starts UTF-8 decoding on the bytes (up to three) that
   were read into [d.t] to guess the encoding. Each of these bytes is
   handled in turn: it is either decoded on its own, reported as
   malformed, or moved to the start of [d.t] so that the regular [t_fill]
   and [t_decode_utf_8] machinery completes the sequence from the input. *)
let guessed_utf_8 d = (* start decoder after `UTF_8 guess. *)
let b3 d = (* handles the third read byte. *)
let b3 = unsafe_byte d.t 2 in
match utf_8_len.(b3) with
| 0 -> ret decode_utf_8 (malformed d.t 2 1) 1 d
| n ->
(* the third byte starts a sequence: move it to the front of [d.t]. *)
d.t_need <- n; d.t_len <- 1; unsafe_set_byte d.t 0 b3;
t_fill t_decode_utf_8 d
in
let b2 d = (* handle second read byte. *)
let b2 = unsafe_byte d.t 1 in
(* continuation used once the second byte is consumed on its own. *)
let b3 = if d.t_len > 2 then b3 else decode_utf_8 (* decodes `End *) in
match utf_8_len.(b2) with
| 0 -> ret b3 (malformed d.t 1 1) 1 d
| 1 -> ret b3 (r_utf_8 d.t 1 1) 1 d
| n -> (* copy d.t.(1-2) to d.t.(0-1) and decode *)
d.t_need <- n;
unsafe_set_byte d.t 0 b2;
if (d.t_len < 3) then d.t_len <- 1 else
(d.t_len <- 2; unsafe_set_byte d.t 1 (unsafe_byte d.t 2); );
t_fill t_decode_utf_8 d
in
let b1 = unsafe_byte d.t 0 in (* handle first read byte. *)
(* continuation used once the first byte is consumed on its own. *)
let b2 = if d.t_len > 1 then b2 else decode_utf_8 (* decodes `End *) in
match utf_8_len.(b1) with
| 0 -> ret b2 (malformed d.t 0 1) 1 d
| 1 -> ret b2 (r_utf_8 d.t 0 1) 1 d
| 2 ->
(* two byte sequence: what follows depends on how many bytes were read. *)
if d.t_len < 2 then ret decode_utf_8 (malformed d.t 0 1) 1 d else
if d.t_len < 3 then ret decode_utf_8 (r_utf_8 d.t 0 2) 2 d else
ret b3 (r_utf_8 d.t 0 2) 2 d
| 3 ->
if d.t_len < 3
then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
else ret decode_utf_8 (r_utf_8 d.t 0 3) 3 d
| 4 ->
(* three bytes are already in [d.t]; one more is needed. *)
if d.t_len < 3
then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
else (d.t_need <- 4; t_fill t_decode_utf_8 d)
| n -> assert false
(* [guessed_utf_16 d be v] starts UTF-16 decoding (big endian iff [be] is
   [true]) on the bytes that were read into [d.t] to guess the encoding.
   [v] tells how the first code unit is handled: [`BOM] returns a BOM
   character, [`ASCII u] returns the character [u], and [`Decode] decodes
   the first two bytes of [d.t]. A third byte in [d.t], if any, is moved
   to the front of [d.t] and completed from the input. *)
let guessed_utf_16 d be v = (* start decoder after `UTF_16{BE,LE} guess. *)
(* decoders and byte indices (most significant first) for the endianness. *)
let decode_utf_16, t_decode_utf_16, t_decode_utf_16_lo, j0, j1 =
if be then decode_utf_16be, t_decode_utf_16be, t_decode_utf_16be_lo, 0, 1
else decode_utf_16le, t_decode_utf_16le, t_decode_utf_16le_lo, 1, 0
in
(* [b3 k d] continues after the first code unit: with fewer than three
   bytes read it hands over to the regular decoder, otherwise it resumes
   with [k] on the third byte. *)
let b3 k d =
if d.t_len < 3 then decode_utf_16 d (* decodes `End *) else
begin (* copy d.t.(2) to d.t.(0) and decode. *)
d.t_need <- 2; d.t_len <- 1;
unsafe_set_byte d.t 0 (unsafe_byte d.t 2);
t_fill k d
end
in
match v with
| `BOM -> ret (b3 t_decode_utf_16) (`Uchar u_bom) 2 d
| `ASCII u -> ret (b3 t_decode_utf_16) (`Uchar u) 2 d
| `Decode ->
match r_utf_16 d.t j0 j1 with
| `Malformed _ | `Uchar _ as v -> ret (b3 t_decode_utf_16) v 2 d
| `Hi hi ->
(* a high surrogate with no byte after it is malformed on its own. *)
if d.t_len < 3
then ret decode_utf_16 (malformed_pair be hi "" 0 0) d.t_len d
else (b3 (t_decode_utf_16_lo hi)) d
(* [guess_encoding d] gathers up to three bytes in [d.t], guesses the
   encoding from them with [r_encoding], records the guess and the
   matching decoder in [d], and starts decoding the gathered bytes. *)
let guess_encoding d =
  let start d =
    match r_encoding d.t 0 d.t_len with
    | `UTF_16BE r ->
        d.encoding <- `UTF_16BE;
        d.k <- decode_utf_16be;
        guessed_utf_16 d true r
    | `UTF_16LE r ->
        d.encoding <- `UTF_16LE;
        d.k <- decode_utf_16le;
        guessed_utf_16 d false r
    | `UTF_8 r ->
        d.encoding <- `UTF_8;
        d.k <- decode_utf_8;
        (match r with
         | `BOM -> ret decode_utf_8 (`Uchar u_bom) 3 d
         | `Decode -> guessed_utf_8 d
         | `End -> `End)
  in
  t_need d 3;
  t_fill start d
(* Character post-processors. Used for BOM handling, newline
normalization and position tracking. The [pp_remove_bom] is only
used for the first character to remove a possible initial BOM and
handle UTF-16 endianness recognition. *)
(* Position tracking helpers. *)

(* [nline d] starts a new line: the line number is incremented and the
   column is reset to zero. *)
let nline d = d.line <- succ d.line; d.col <- 0 (* inlined. *)

(* [ncol d] increments the column number. *)
let ncol d = d.col <- succ d.col (* inlined. *)

(* [ncount d] increments the character count. *)
let ncount d = d.count <- succ d.count (* inlined. *)

(* [cr d b] records whether the last character was a CR. *)
let cr d b = d.last_cr <- b (* inlined. *)
(* [pp_remove_bom utf16 pp d v] post-processes the first decoded value
   [v]. It installs [pp] as the post-processor for what follows and:
   - drops a BOM and decodes the next character instead; when [utf16] is
     [true] the BOM also selects UTF-16BE;
   - when [utf16] is [true], treats 0xFFFE (a BOM read with the wrong
     byte order by the big endian decoder) as a BOM selecting UTF-16LE;
   - hands any other value to [pp]. *)
let pp_remove_bom utf16 pp d v =
  (* Drop the BOM and return the next decoded value. *)
  let skip_bom () = d.removed_bom <- true; d.pp <- pp; d.k d in
  match v with
  | `Uchar 0xFEFF (* BOM *) ->
      if utf16 then begin
        d.encoding <- `UTF_16BE;
        d.k <- decode_utf_16be
      end;
      skip_bom ()
  | `Uchar 0xFFFE (* BOM reversed from decode_utf_16be *) when utf16 ->
      d.encoding <- `UTF_16LE;
      d.k <- decode_utf_16le;
      skip_bom ()
  | `Malformed _ | `Uchar _ as v ->
      d.removed_bom <- false;
      d.pp <- pp;
      d.pp d v
(* [pp_nln_none d v] tracks position without newline normalization: [v]
   is returned unchanged. Every value counts as one character. CR, NEL,
   FF, LS and PS start a new line; so does LF, except right after a CR
   which already did. Any other value advances the column. *)
let pp_nln_none d v =
  let was_cr = d.last_cr in
  match v with
  | `Uchar 0x000A (* LF *) as v ->
      cr d false;
      ncount d;
      if not was_cr then nline d;
      v
  | `Uchar 0x000D (* CR *) as v ->
      cr d true; ncount d; nline d; v
  | `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) as v ->
      cr d false; ncount d; nline d; v
  | `Uchar _ | `Malformed _ as v ->
      cr d false; ncount d; ncol d; v
(* [pp_nln_readline d v] tracks position and replaces CR, CR LF, LF, NEL,
   FF, LS and PS by the newline character [d.nl]. An LF that directly
   follows a CR is dropped: the next decoded value is returned in its
   place. *)
let pp_nln_readline d v =
  (* Count a character, start a new line and return the normalized
     newline. *)
  let newline () = ncount d; nline d; `Uchar d.nl in
  match v with
  | `Uchar 0x000A (* LF *) ->
      let was_cr = d.last_cr in
      cr d false;
      if was_cr then d.k d else newline ()
  | `Uchar 0x000D (* CR *) ->
      cr d true; newline ()
  | `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
      cr d false; newline ()
  | `Uchar _ | `Malformed _ as v ->
      cr d false; ncount d; ncol d; v
(* [pp_nln_nlf d v] tracks position and replaces CR, CR LF, LF and NEL by
   the newline character [d.nl]. An LF that directly follows a CR is
   dropped: the next decoded value is returned in its place. FF, LS and
   PS start a new line but are returned unchanged. *)
let pp_nln_nlf d v =
  (* Count a character, start a new line and return the normalized
     newline. *)
  let newline () = ncount d; nline d; `Uchar d.nl in
  match v with
  | `Uchar 0x000A (* LF *) ->
      let was_cr = d.last_cr in
      cr d false;
      if was_cr then d.k d else newline ()
  | `Uchar 0x000D (* CR *) ->
      cr d true; newline ()
  | `Uchar 0x0085 (* NEL *) ->
      cr d false; newline ()
  | `Uchar (0x000C | 0x2028 | 0x2029) as v (* FF | LS | PS *) ->
      cr d false; ncount d; nline d; v
  | `Uchar _ | `Malformed _ as v ->
      cr d false; ncount d; ncol d; v
(* [pp_nln_ascii d v] tracks position and replaces CR, CR LF and LF by the
   newline character [d.nl]. An LF that directly follows a CR is dropped:
   the next decoded value is returned in its place. NEL, FF, LS and PS
   start a new line but are returned unchanged. *)
let pp_nln_ascii d v =
  (* Count a character, start a new line and return the normalized
     newline. *)
  let newline () = ncount d; nline d; `Uchar d.nl in
  match v with
  | `Uchar 0x000A (* LF *) ->
      let was_cr = d.last_cr in
      cr d false;
      if was_cr then d.k d else newline ()
  | `Uchar 0x000D (* CR *) ->
      cr d true; newline ()
  | `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) as v (* NEL | FF | LS | PS *) ->
      cr d false; ncount d; nline d; v
  | `Uchar _ | `Malformed _ as v ->
      cr d false; ncount d; ncol d; v
(* [decode_fun encoding] is the decoding function for [encoding]. Plain
   [`UTF_16] starts with the big endian decoder; see [pp_remove_bom] for
   how the byte order is then settled. *)
let decode_fun encoding = match encoding with
  | `UTF_16 (* see [pp_remove_bom]. *)
  | `UTF_16BE -> decode_utf_16be
  | `UTF_16LE -> decode_utf_16le
  | `UTF_8 -> decode_utf_8
  | `US_ASCII -> decode_us_ascii
  | `ISO_8859_1 -> decode_iso_8859_1
(* [decoder ?nln ?encoding src] is a fresh decoder that reads from [src].
   - [nln] selects the newline normalization and its replacement
     character; without it newlines are left untouched.
   - [encoding] selects the decoding function; without it the encoding is
     guessed from the first bytes (and reported as UTF-8 until then).
   The first decoded value goes through [pp_remove_bom]. *)
let decoder ?nln ?encoding src =
  (* Initial input chunk. Manual and channel sources start with an empty
     range ([i_rem d = 0]) so that the first decode asks for input. *)
  let i, i_pos, i_max = match src with
    | `Manual -> "", 1, 0
    | `Channel _ -> String.create io_buffer_size, 1, 0
    | `String s -> s, 0, String.length s - 1
  in
  let encoding, k = match encoding with
    | None -> `UTF_8, guess_encoding
    | Some e -> (e :> decoder_encoding), decode_fun e
  in
  let pp, nl = match nln with
    | None -> pp_nln_none, 0x000A (* not used. *)
    | Some (`ASCII nl) -> pp_nln_ascii, nl
    | Some (`NLF nl) -> pp_nln_nlf, nl
    | Some (`Readline nl) -> pp_nln_readline, nl
  in
  let pp = pp_remove_bom (encoding = `UTF_16) pp in
  let t = String.create 4 in
  { src = (src :> src); encoding; nln = (nln :> nln option); nl;
    i; i_pos; i_max; t; t_len = 0; t_need = 0;
    removed_bom = false; last_cr = false; line = 1; col = 0;
    byte_count = 0; count = 0;
    pp; k }
(* [decode d] runs the current continuation of [d]. *)
let decode d =
  let k = d.k in
  k d

(* Accessors on the decoder state. *)
let decoder_line { line; _ } = line
let decoder_col { col; _ } = col
let decoder_byte_count { byte_count; _ } = byte_count
let decoder_count { count; _ } = count
let decoder_removed_bom { removed_bom; _ } = removed_bom
let decoder_src { src; _ } = src
let decoder_nln { nln; _ } = nln
let decoder_encoding { encoding; _ } = encoding

(* [set_decoder_encoding d e] makes [d] decode with the decoding function
   for [e] from now on, and records [e] as its encoding. *)
let set_decoder_encoding d e =
  let k = decode_fun e in
  d.encoding <- (e :> decoder_encoding);
  d.k <- k
(* Encode *)
(* Output destinations, the values accepted by an encoder, and the encoder
   state. An encoder writes bytes in the chunk [o] between [o_pos] and
   [o_max]; a character that does not fit in what remains of the chunk is
   first encoded in the small buffer [t]. Encoding is driven by the
   continuation [k]. *)
type dst = [ `Channel of out_channel | `Buffer of Buffer.t | `Manual ]
type encode = [ `Await | `End | `Uchar of uchar ]
type encoder =
{ dst : dst; (* output destination. *)
encoding : encoding; (* encoded encoding. *)
mutable o : string; (* current output chunk. *)
mutable o_pos : int; (* next output position to write. *)
mutable o_max : int; (* maximal output position to write. *)
t : string; (* four bytes buffer for overlapping writes. *)
mutable t_pos : int; (* next position to read in [t]. *)
mutable t_max : int; (* maximal position to read in [t]. *)
mutable k : (* encoder continuation. *)
encoder -> encode -> [ `Ok | `Partial ] }
(* On encodes that overlap two (or more) [e.o] buffers, we encode the
character to the temporary buffer [e.t] and continue with
[t_flush] to write this data on the different [e.o] buffers. If
the [e.o] buffers are not too small this is faster than
continuation based byte per byte writes. *)
(* [o_rem e] is the number of bytes that can still be written in [e.o]. *)
let o_rem e = succ (e.o_max - e.o_pos)

(* [dst e s j l] makes the [l] bytes of [s] that start at [j] the output
   chunk of [e]. Raises [Invalid_argument] if [j] and [l] do not designate
   a valid range of [s]. *)
let dst e s j l =
  if j < 0 || l < 0 || j + l > String.length s then invalid_bounds j l
  else begin
    e.o <- s;
    e.o_pos <- j;
    e.o_max <- j + l - 1
  end
(* [partial k e v] is the continuation installed while a manual
   destination is full: the client must encode [`Await], upon which [k]
   resumes; anything else raises [Invalid_argument]. *)
let partial k e v = match v with
  | `Uchar _ | `End -> invalid_encode ()
  | `Await -> k e

(* [flush k e] gets free storage in [e.o] and continues with [k]:
   - a manual destination installs [partial k] and returns [`Partial];
   - buffer and channel destinations receive the bytes written so far in
     [e.o], after which writing restarts at the beginning of [e.o]. *)
let flush k e = match e.dst with
  | `Manual -> e.k <- partial k; `Partial
  | `Buffer b ->
      let written = e.o_pos in
      Buffer.add_substring b e.o 0 written;
      e.o_pos <- 0;
      k e
  | `Channel oc ->
      let written = e.o_pos in
      output oc e.o 0 written;
      e.o_pos <- 0;
      k e
(* [t_range e max] declares that [e.t] holds bytes to output from index 0
   to index [max]. *)
let t_range e max = e.t_max <- max; e.t_pos <- 0

(* [t_flush k e] copies the bytes of [e.t] from [e.t_pos] to [e.t_max] to
   [e.o], flushing [e.o] each time it fills up, then continues with
   [k]. *)
let rec t_flush k e =
  let emit n =
    unsafe_blit e.t e.t_pos e.o e.o_pos n;
    e.t_pos <- e.t_pos + n;
    e.o_pos <- e.o_pos + n
  in
  let room = o_rem e in
  let pending = e.t_max - e.t_pos + 1 in
  if pending <= room then (emit pending; k e)
  else (emit room; flush (t_flush k) e)
(* Encoders. *)
(* [encode_utf_8 e v] is the UTF-8 encoding continuation. [`Await] only
   reinstalls the encoder, [`End] flushes the output and [`Uchar u] writes
   [u] as a 1 to 4 byte sequence. A sequence that does not fit in the
   rest of [e.o] is written to [e.t] and moved to the output by
   [t_flush]. *)
let rec encode_utf_8 e v =
  let k e = e.k <- encode_utf_8; `Ok in
  (* [slot n] is where to write an [n] byte sequence (string and index)
     and the continuation to call once the bytes are written. *)
  let slot n =
    if o_rem e < n then begin
      t_range e (n - 1);
      (e.t, 0, t_flush k)
    end else begin
      let j = e.o_pos in
      e.o_pos <- j + n;
      (e.o, j, k)
    end
  in
  (* Continuation byte carrying the six low bits of [bits]. *)
  let cont bits = 0x80 lor (bits land 0x3F) in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u as v ->
      if u <= 0x007F then begin
        (* A single byte never overlaps: flush first if there is no room. *)
        if o_rem e < 1 then flush (fun e -> encode_utf_8 e v) e
        else begin
          unsafe_set_byte e.o e.o_pos u;
          e.o_pos <- e.o_pos + 1;
          k e
        end
      end else if u <= 0x07FF then begin
        let s, j, k = slot 2 in
        unsafe_set_byte s j (0xC0 lor (u lsr 6));
        unsafe_set_byte s (j + 1) (cont u);
        k e
      end else if u <= 0xFFFF then begin
        let s, j, k = slot 3 in
        unsafe_set_byte s j (0xE0 lor (u lsr 12));
        unsafe_set_byte s (j + 1) (cont (u lsr 6));
        unsafe_set_byte s (j + 2) (cont u);
        k e
      end else begin
        let s, j, k = slot 4 in
        unsafe_set_byte s j (0xF0 lor (u lsr 18));
        unsafe_set_byte s (j + 1) (cont (u lsr 12));
        unsafe_set_byte s (j + 2) (cont (u lsr 6));
        unsafe_set_byte s (j + 3) (cont u);
        k e
      end
(* [encode_utf_16be e v] is the UTF-16BE encoding continuation. [`Await]
   only reinstalls the encoder, [`End] flushes the output and [`Uchar u]
   writes [u] as one code unit, or as a surrogate pair if [u] is at least
   0x10000. Bytes that do not fit in the rest of [e.o] are written to
   [e.t] and moved to the output by [t_flush]. *)
let rec encode_utf_16be e v =
  let k e = e.k <- encode_utf_16be; `Ok in
  (* [slot n] is where to write [n] bytes (string and index) and the
     continuation to call once the bytes are written. *)
  let slot n =
    if o_rem e < n then begin
      t_range e (n - 1);
      (e.t, 0, t_flush k)
    end else begin
      let j = e.o_pos in
      e.o_pos <- j + n;
      (e.o, j, k)
    end
  in
  (* Write a code unit at [j], most significant byte first. *)
  let put s j unit16 =
    unsafe_set_byte s j (unit16 lsr 8);
    unsafe_set_byte s (j + 1) (unit16 land 0xFF)
  in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u when u < 0x10000 ->
      let s, j, k = slot 2 in
      put s j u;
      k e
  | `Uchar u ->
      let s, j, k = slot 4 in
      let u' = u - 0x10000 in
      let hi = 0xD800 lor (u' lsr 10) in
      let lo = 0xDC00 lor (u' land 0x3FF) in
      put s j hi;
      put s (j + 2) lo;
      k e
(* [encode_utf_16le e v] is the UTF-16LE encoding continuation: the same
   as [encode_utf_16be] with the two bytes of each code unit swapped.
   [`Await] only reinstalls the encoder, [`End] flushes the output and
   [`Uchar u] writes [u] as one code unit, or as a surrogate pair if [u]
   is at least 0x10000. Bytes that do not fit in the rest of [e.o] are
   written to [e.t] and moved to the output by [t_flush]. *)
let rec encode_utf_16le e v =
  let k e = e.k <- encode_utf_16le; `Ok in
  (* [slot n] is where to write [n] bytes (string and index) and the
     continuation to call once the bytes are written. *)
  let slot n =
    if o_rem e < n then begin
      t_range e (n - 1);
      (e.t, 0, t_flush k)
    end else begin
      let j = e.o_pos in
      e.o_pos <- j + n;
      (e.o, j, k)
    end
  in
  (* Write a code unit at [j], least significant byte first. *)
  let put s j unit16 =
    unsafe_set_byte s j (unit16 land 0xFF);
    unsafe_set_byte s (j + 1) (unit16 lsr 8)
  in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u when u < 0x10000 ->
      let s, j, k = slot 2 in
      put s j u;
      k e
  | `Uchar u ->
      let s, j, k = slot 4 in
      let u' = u - 0x10000 in
      let hi = 0xD800 lor (u' lsr 10) in
      let lo = 0xDC00 lor (u' land 0x3FF) in
      put s j hi;
      put s (j + 2) lo;
      k e
(* [encode_fun encoding] is the encoding function for [encoding]. Plain
   [`UTF_16] is encoded big endian. *)
let encode_fun encoding = match encoding with
  | `UTF_16 | `UTF_16BE -> encode_utf_16be
  | `UTF_16LE -> encode_utf_16le
  | `UTF_8 -> encode_utf_8
(* [encoder encoding dst] is a fresh encoder that writes [encoding] to
   [dst]. Buffer and channel destinations get an output chunk of
   [io_buffer_size] bytes; a manual destination starts without any
   storage, so the client has to provide some. *)
let encoder encoding dst =
  let k = encode_fun encoding in
  let t = String.create 4 in
  let o, o_pos, o_max = match dst with
    | `Manual -> "", 1, 0 (* implies o_rem e = 0. *)
    | `Buffer _ | `Channel _ ->
        String.create io_buffer_size, 0, io_buffer_size - 1
  in
  { dst = (dst :> dst); encoding = (encoding :> encoding); o; o_pos; o_max;
    t; t_pos = 1; t_max = 0; k }

(* [encode e v] runs the current continuation of [e] on [v]. *)
let encode e v =
  let k = e.k in
  k e (v :> encode)

(* Accessors on the encoder state. *)
let encoder_encoding (e : encoder) = e.encoding
let encoder_dst (e : encoder) = e.dst
(* Manual sources and destinations. *)
module Manual = struct
  (* [src d s j l] provides the decoder [d] with the [l] bytes of [s] that
     start at [j]; see the toplevel [src]. *)
  let src d s j l = src d s j l

  (* [dst e s j l] provides the encoder [e] with the [l] bytes of [s] that
     start at [j] as output storage; see the toplevel [dst]. *)
  let dst e s j l = dst e s j l

  (* [dst_rem e] is the number of bytes that can still be written in the
     output storage of [e]. *)
  let dst_rem e = o_rem e
end
(* Strings folders and Buffer encoders *)
module String = struct
  (* [encoding_guess s] guesses the encoding of [s] from its first bytes
     (three at most). The result is the guessed encoding and whether the
     guess comes from a byte order mark.

     The length handed to [r_encoding] is capped by the length of [s]:
     [r_encoding] reads bytes without bounds checks, so it must never be
     told that more bytes are available than [s] actually has. *)
  let encoding_guess s =
    let l = min (String.length s) 3 in
    match r_encoding s 0 l with
    | `UTF_8 d -> `UTF_8, (d = `BOM)
    | `UTF_16BE d -> `UTF_16BE, (d = `BOM)
    | `UTF_16LE d -> `UTF_16LE, (d = `BOM)

  (* A folder is called with the accumulator, the byte index at which the
     decoded value starts and the decoded value. *)
  type 'a folder =
    'a -> int -> [ `Uchar of uchar | `Malformed of string ] -> 'a

  (* [fold_utf_8 f acc s] folds [f] over the UTF-8 sequences of [s], in
     order. A byte that cannot start a sequence is reported as a one byte
     [`Malformed] and folding resumes on the next byte. A sequence cut
     short by the end of [s] is reported as [`Malformed] and ends the
     fold. *)
  let fold_utf_8 f acc s =
    let rec loop acc f s i l =
      if i = l then acc else
      let need = unsafe_array_get utf_8_len (unsafe_byte s i) in
      if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) l else
      let rem = l - i in
      if rem < need then f acc i (malformed s i rem) else
      loop (f acc i (r_utf_8 s i need)) f s (i + need) l
    in
    loop acc f s 0 (String.length s)

  (* [fold_utf_16be f acc s] folds [f] over the UTF-16BE characters of
     [s], in order. A lone trailing byte, or a high surrogate followed by
     fewer than two bytes, is reported as [`Malformed] and ends the
     fold. *)
  let fold_utf_16be f acc s =
    let rec loop acc f s i l =
      if i = l then acc else
      let rem = l - i in
      if rem < 2 then f acc i (malformed s i 1) else
      match r_utf_16 s i (i + 1) with
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) l
      | `Hi hi ->
          if rem < 4 then f acc i (malformed s i rem) else
          loop (f acc i (r_utf_16_lo hi s (i + 2) (i + 3))) f s (i + 4) l
    in
    loop acc f s 0 (String.length s)

  (* [fold_utf_16le f acc s] is like [fold_utf_16be] with the two bytes of
     each code unit swapped. *)
  let fold_utf_16le f acc s = (* [fold_utf_16be], bytes swapped. *)
    let rec loop acc f s i l =
      if i = l then acc else
      let rem = l - i in
      if rem < 2 then f acc i (malformed s i 1) else
      match r_utf_16 s (i + 1) i with
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) l
      | `Hi hi ->
          if rem < 4 then f acc i (malformed s i rem) else
          loop (f acc i (r_utf_16_lo hi s (i + 3) (i + 2))) f s (i + 4) l
    in
    loop acc f s 0 (String.length s)
end
module Buffer = struct
  (* [add_utf_8 b u] appends the UTF-8 encoding of [u] to [b]: one to four
     bytes depending on the magnitude of [u]. *)
  let add_utf_8 b u =
    let w byte = Buffer.add_char b (unsafe_chr byte) in (* inlined. *)
    (* Continuation byte carrying the six low bits of [bits]. *)
    let cont bits = w (0x80 lor (bits land 0x3F)) in
    if u <= 0x007F then w u
    else if u <= 0x07FF then begin
      w (0xC0 lor (u lsr 6));
      cont u
    end else if u <= 0xFFFF then begin
      w (0xE0 lor (u lsr 12));
      cont (u lsr 6);
      cont u
    end else begin
      w (0xF0 lor (u lsr 18));
      cont (u lsr 12);
      cont (u lsr 6);
      cont u
    end

  (* [add_utf_16be b u] appends the UTF-16BE encoding of [u] to [b]: one
     code unit, or a surrogate pair if [u] is at least 0x10000. *)
  let add_utf_16be b u =
    let w byte = Buffer.add_char b (unsafe_chr byte) in (* inlined. *)
    (* Code unit, most significant byte first. *)
    let w16 unit16 = w (unit16 lsr 8); w (unit16 land 0xFF) in
    if u < 0x10000 then w16 u
    else begin
      let u' = u - 0x10000 in
      let hi = 0xD800 lor (u' lsr 10) in
      let lo = 0xDC00 lor (u' land 0x3FF) in
      w16 hi;
      w16 lo
    end

  (* [add_utf_16le b u] is like [add_utf_16be] with the two bytes of each
     code unit swapped. *)
  let add_utf_16le b u = (* swapped add_utf_16be. *)
    let w byte = Buffer.add_char b (unsafe_chr byte) in (* inlined. *)
    (* Code unit, least significant byte first. *)
    let w16 unit16 = w (unit16 land 0xFF); w (unit16 lsr 8) in
    if u < 0x10000 then w16 u
    else begin
      let u' = u - 0x10000 in
      let hi = 0xD800 lor (u' lsr 10) in
      let lo = 0xDC00 lor (u' land 0x3FF) in
      w16 hi;
      w16 lo
    end
end
(*---------------------------------------------------------------------------
Copyright 2012 Daniel C. Bünzli
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
3. Neither the name of Daniel C. Bünzli nor the names of
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------------*)