mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			811 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			OCaml
		
	
	
	
	
	
			
		
		
	
	
			811 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			OCaml
		
	
	
	
	
	
(*---------------------------------------------------------------------------
 | 
						|
   Copyright 2012 Daniel C. Bünzli. All rights reserved.
 | 
						|
   Distributed under the BSD3 license, see license at the end of the file.
 | 
						|
   %%NAME%% release %%VERSION%%
 | 
						|
  ---------------------------------------------------------------------------*)
 | 
						|
 | 
						|
let io_buffer_size = 65536                           (* IO_BUFFER_SIZE 4.0.0 *)
 | 
						|
 | 
						|
let pp = Format.fprintf
 | 
						|
let invalid_encode () = invalid_arg "expected `Await encode"
 | 
						|
let invalid_bounds j l =
 | 
						|
  invalid_arg (Printf.sprintf "invalid bounds (index %d, length %d)" j l)
 | 
						|
 | 
						|
(* Unsafe string byte manipulations. If you don't believe the author's
 | 
						|
   invariants, replacing with safe versions makes everything safe in
 | 
						|
   the module. He won't be upset. *)
 | 
						|
 | 
						|
let unsafe_chr = Char.unsafe_chr
 | 
						|
let unsafe_blit = String.unsafe_blit
 | 
						|
let unsafe_array_get = Array.unsafe_get
 | 
						|
let unsafe_byte s j = Char.code (String.unsafe_get s j)
 | 
						|
let unsafe_set_byte s j byte = String.unsafe_set s j (Char.unsafe_chr byte)
 | 
						|
 | 
						|
(* Unicode characters *)
 | 
						|
 | 
						|
type uchar = int
 | 
						|
let u_bom = 0xFEFF                                                   (* BOM. *)
 | 
						|
let u_rep = 0xFFFD                                 (* replacement character. *)
 | 
						|
let is_uchar cp =
 | 
						|
  (0x0000 <= cp && cp <= 0xD7FF) || (0xE000 <= cp && cp <= 0x10FFFF)
 | 
						|
 | 
						|
let pp_cp ppf cp =
 | 
						|
  if cp < 0 || cp > 0x10FFFF then pp ppf "U+Invalid(%X)" cp else
 | 
						|
  if cp <= 0xFFFF then pp ppf "U+%04X" cp else
 | 
						|
  pp ppf "U+%X" cp
 | 
						|
 | 
						|
let cp_to_string cp =                                    (* NOT thread safe. *)
 | 
						|
  pp Format.str_formatter "%a" pp_cp cp; Format.flush_str_formatter ()
 | 
						|
 | 
						|
(* Unicode encoding schemes *)
 | 
						|
 | 
						|
type encoding = [ `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE ]
 | 
						|
type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ]
 | 
						|
 | 
						|
let encoding_of_string s = match String.uppercase s with      (* IANA names. *)
 | 
						|
| "UTF-8" -> Some `UTF_8
 | 
						|
| "UTF-16" -> Some `UTF_16
 | 
						|
| "UTF-16LE" -> Some `UTF_16LE
 | 
						|
| "UTF-16BE" -> Some `UTF_16BE
 | 
						|
| "ANSI_X3.4-1968" | "ISO-IR-6" | "ANSI_X3.4-1986" | "ISO_646.IRV:1991"
 | 
						|
| "ASCII" | "ISO646-US" | "US-ASCII" | "US" | "IBM367" | "CP367" | "CSASCII" ->
 | 
						|
    Some `US_ASCII
 | 
						|
| "ISO_8859-1:1987" | "ISO-IR-100" | "ISO_8859-1" | "ISO-8859-1"
 | 
						|
| "LATIN1" | "L1" | "IBM819" | "CP819" | "CSISOLATIN1" ->
 | 
						|
    Some `ISO_8859_1
 | 
						|
| _ -> None
 | 
						|
 | 
						|
let encoding_to_string = function
 | 
						|
| `UTF_8 -> "UTF-8" | `UTF_16 -> "UTF-16" | `UTF_16BE -> "UTF-16BE"
 | 
						|
| `UTF_16LE -> "UTF-16LE" | `US_ASCII -> "US-ASCII"
 | 
						|
| `ISO_8859_1 -> "ISO-8859-1"
 | 
						|
 | 
						|
(* Base character decoders. They assume enough data. *)
 | 
						|
 | 
						|
let malformed s j l = `Malformed (String.sub s j l)
 | 
						|
let malformed_pair be hi s j l =    (* missing or half low surrogate at eoi. *)
 | 
						|
  let bs1 = String.sub s j l in
 | 
						|
  let bs0 = String.create 2 in
 | 
						|
  let j0, j1 = if be then (0, 1) else (1, 0) in
 | 
						|
  unsafe_set_byte bs0 j0 (hi lsr 8);
 | 
						|
  unsafe_set_byte bs0 j1 (hi land 0xFF);
 | 
						|
  `Malformed (bs0 ^ bs1)
 | 
						|
 | 
						|
let r_us_ascii s j =
 | 
						|
  (* assert (0 <= j && j < String.length s); *)
 | 
						|
  let b0 = unsafe_byte s j in
 | 
						|
  if b0 <= 127 then `Uchar b0 else malformed s j 1
 | 
						|
 | 
						|
let r_iso_8859_1 s j =
 | 
						|
  (* assert (0 <= j && j < String.length s); *)
 | 
						|
  `Uchar (unsafe_byte s j)
 | 
						|
 | 
						|
let utf_8_len = [| (* uchar byte length according to first UTF-8 byte. *)
 | 
						|
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
 | 
						|
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
 | 
						|
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
 | 
						|
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
 | 
						|
  1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
 | 
						|
  1; 1; 1; 1; 1; 1; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
 | 
						|
  0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
 | 
						|
  0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
 | 
						|
  0; 0; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2;
 | 
						|
  2; 2; 2; 2; 2; 2; 2; 2; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3;
 | 
						|
  4; 4; 4; 4; 4; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |]
 | 
						|
 | 
						|
let r_utf_8 s j l =
 | 
						|
  (* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
 | 
						|
  match l with
 | 
						|
  | 1 -> `Uchar (unsafe_byte s j)
 | 
						|
  | 2 ->
 | 
						|
      let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in
 | 
						|
      if b1 lsr 6 != 0b10 then malformed s j l else
 | 
						|
      `Uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
 | 
						|
  | 3 ->
 | 
						|
      let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in
 | 
						|
      let b2 = unsafe_byte s (j + 2) in
 | 
						|
      let c = `Uchar (((b0 land 0x0F) lsl 12) lor
 | 
						|
                      ((b1 land 0x3F) lsl 6) lor
 | 
						|
          (b2 land 0x3F))
 | 
						|
      in
 | 
						|
      if b2 lsr 6 != 0b10 then malformed s j l else
 | 
						|
      begin match b0 with
 | 
						|
      | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then malformed s j l else c
 | 
						|
      | 0xED -> if b1 < 0x80 || 0x9F < b1 then malformed s j l else c
 | 
						|
      | _ -> if b1 lsr 6 != 0b10 then malformed s j l else c
 | 
						|
      end
 | 
						|
  | 4 ->
 | 
						|
      let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in
 | 
						|
      let b2 = unsafe_byte s (j + 2) in let b3 = unsafe_byte s (j + 3) in
 | 
						|
      let c = `Uchar (((b0 land 0x07) lsl 18) lor
 | 
						|
                      ((b1 land 0x3F) lsl 12) lor
 | 
						|
          ((b2 land 0x3F) lsl 6) lor
 | 
						|
                      (b3 land 0x3F))
 | 
						|
      in
 | 
						|
      if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l else
 | 
						|
      begin match b0 with
 | 
						|
      | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then malformed s j l else c
 | 
						|
      | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then malformed s j l else c
 | 
						|
      | _ -> if b1 lsr 6 != 0b10 then malformed s j l else c
 | 
						|
      end
 | 
						|
  | _ -> assert false
 | 
						|
 | 
						|
let r_utf_16 s j0 j1 =                       (* May return a high surrogate. *)
 | 
						|
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
 | 
						|
  let b0 = unsafe_byte s j0 in let b1 = unsafe_byte s j1 in
 | 
						|
  let u = (b0 lsl 8) lor b1 in
 | 
						|
  if u < 0xD800 || u > 0xDFFF then `Uchar u else
 | 
						|
  if u > 0xDBFF then malformed s (min j0 j1) 2 else `Hi u
 | 
						|
 | 
						|
let r_utf_16_lo hi s j0 j1 =          (* Combines [hi] with a low surrogate. *)
 | 
						|
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
 | 
						|
  let b0 = unsafe_byte s j0 in
 | 
						|
  let b1 = unsafe_byte s j1 in
 | 
						|
  let lo = (b0 lsl 8) lor b1 in
 | 
						|
  if lo < 0xDC00 || lo > 0xDFFF
 | 
						|
  then malformed_pair (j0 < j1 (* true => be *)) hi s (min j0 j1) 2
 | 
						|
  else `Uchar ((((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000)
 | 
						|
 | 
						|
let r_encoding s j l =                  (* guess encoding with max. 3 bytes. *)
 | 
						|
  (* assert (0 <= j && 0 <= l && j + l <= String.length s) *)
 | 
						|
  let some i = if i < l then Some (unsafe_byte s (j + i)) else None in
 | 
						|
  match (some 0), (some 1), (some 2) with
 | 
						|
  | Some 0xEF, Some 0xBB, Some 0xBF                 -> `UTF_8 `BOM
 | 
						|
  | Some 0xFE, Some 0xFF, _                         -> `UTF_16BE `BOM
 | 
						|
  | Some 0xFF, Some 0xFE, _                         -> `UTF_16LE `BOM
 | 
						|
  | Some 0x00, Some    p, _              when p > 0 -> `UTF_16BE (`ASCII p)
 | 
						|
  | Some    p, Some 0x00, _              when p > 0 -> `UTF_16LE (`ASCII p)
 | 
						|
  | Some    u,         _, _ when utf_8_len.(u) <> 0 -> `UTF_8 `Decode
 | 
						|
  | Some    _, Some    _, _                         -> `UTF_16BE `Decode
 | 
						|
  | Some    _, None     , None                      -> `UTF_8 `Decode
 | 
						|
  | None     , None     , None                      -> `UTF_8 `End
 | 
						|
  | None     , Some    _, _                         -> assert false
 | 
						|
  | Some    _, None     , Some _                    -> assert false
 | 
						|
  | None     , None     , Some _                    -> assert false
 | 
						|
 | 
						|
(* Decode *)
 | 
						|
 | 
						|
type src = [ `Channel of in_channel | `String of string | `Manual ]
 | 
						|
type nln = [ `ASCII of uchar | `NLF of uchar | `Readline of uchar ]
 | 
						|
type decode = [ `Await | `End | `Malformed of string | `Uchar of uchar]
 | 
						|
 | 
						|
let pp_decode ppf = function
 | 
						|
| `Uchar u -> pp ppf "@[`Uchar %a@]" pp_cp u
 | 
						|
| `End -> pp ppf "`End"
 | 
						|
| `Await -> pp ppf "`Await"
 | 
						|
| `Malformed bs ->
 | 
						|
    let l = String.length bs in
 | 
						|
    pp ppf "@[`Malformed (";
 | 
						|
    if l > 0 then pp ppf "%02X" (Char.code (bs.[0]));
 | 
						|
    for i = 1 to l - 1 do pp ppf " %02X" (Char.code (bs.[i])) done;
 | 
						|
    pp ppf ")@]"
 | 
						|
 | 
						|
type decoder =
 | 
						|
  { src : src;                                              (* input source. *)
 | 
						|
    mutable encoding : decoder_encoding;                (* decoded encoding. *)
 | 
						|
    nln : nln option;                     (* newline normalization (if any). *)
 | 
						|
    nl : int;                            (* newline normalization character. *)
 | 
						|
    mutable i : string;                              (* current input chunk. *)
 | 
						|
    mutable i_pos : int;                          (* input current position. *)
 | 
						|
    mutable i_max : int;                          (* input maximal position. *)
 | 
						|
    t : string;        (* four bytes temporary buffer for overlapping reads. *)
 | 
						|
    mutable t_len : int;                      (* current byte length of [t]. *)
 | 
						|
    mutable t_need : int;                  (* number of bytes needed in [t]. *)
 | 
						|
    mutable removed_bom : bool;     (* [true] if an initial BOM was removed. *)
 | 
						|
    mutable last_cr : bool;                   (* [true] if last char was CR. *)
 | 
						|
    mutable line : int;                                      (* line number. *)
 | 
						|
    mutable col : int;                                     (* column number. *)
 | 
						|
    mutable byte_count : int;                                 (* byte count. *)
 | 
						|
    mutable count : int;                                      (* char count. *)
 | 
						|
    mutable pp :        (* decoder post-processor for BOM, position and nln. *)
 | 
						|
      decoder -> [ `Malformed of string | `Uchar of uchar ] -> decode;
 | 
						|
    mutable k : decoder -> decode }                 (* decoder continuation. *)
 | 
						|
 | 
						|
(* On decodes that overlap two (or more) [d.i] buffers, we use [t_fill] to copy
 | 
						|
   the input data to [d.t] and decode from there. If the [d.i] buffers are not
 | 
						|
   too small this is faster than continuation based byte per byte writes.
 | 
						|
 | 
						|
   End of input (eoi) is signalled by [d.i_pos = 0] and [d.i_max = min_int]
 | 
						|
   which implies that [i_rem d < 0] is [true]. *)
 | 
						|
 | 
						|
let i_rem d = d.i_max - d.i_pos + 1     (* remaining bytes to read in [d.i]. *)
 | 
						|
let eoi d = d.i <- ""; d.i_pos <- 0; d.i_max <- min_int   (* set eoi in [d]. *)
 | 
						|
let src d s j l =                                     (* set [d.i] with [s]. *)
 | 
						|
  if (j < 0 || l < 0 || j + l > String.length s) then invalid_bounds j l else
 | 
						|
  if (l = 0) then eoi d else
 | 
						|
  (d.i <- s; d.i_pos <- j; d.i_max <- j + l - 1)
 | 
						|
 | 
						|
let refill k d = match d.src with  (* get new input in [d.i] and [k]ontinue. *)
 | 
						|
| `Manual -> d.k <- k; `Await
 | 
						|
| `String _ -> eoi d; k d
 | 
						|
| `Channel ic ->
 | 
						|
    let rc = input ic d.i 0 (String.length d.i) in
 | 
						|
    (src d d.i 0 rc; k d)
 | 
						|
 | 
						|
let t_need d need = d.t_len <- 0; d.t_need <- need
 | 
						|
let rec t_fill k d =      (* get [d.t_need] bytes (or less if eoi) in [i.t]. *)
 | 
						|
  let blit d l =
 | 
						|
    unsafe_blit d.i d.i_pos d.t d.t_len (* write pos. *) l;
 | 
						|
    d.i_pos <- d.i_pos + l; d.t_len <- d.t_len + l;
 | 
						|
  in
 | 
						|
  let rem = i_rem d in
 | 
						|
  if rem < 0 (* eoi *) then k d else
 | 
						|
  let need = d.t_need - d.t_len in
 | 
						|
  if rem < need then (blit d rem; refill (t_fill k) d) else (blit d need; k d)
 | 
						|
 | 
						|
let ret k v byte_count d =                     (* return post-processed [v]. *)
 | 
						|
  d.k <- k; d.byte_count <- d.byte_count + byte_count; d.pp d v
 | 
						|
 | 
						|
(* Decoders. *)
 | 
						|
 | 
						|
let rec decode_us_ascii d =
 | 
						|
  let rem = i_rem d in
 | 
						|
  if rem <= 0 then (if rem < 0 then `End else refill decode_us_ascii d) else
 | 
						|
  let j = d.i_pos in
 | 
						|
  d.i_pos <- d.i_pos + 1; ret decode_us_ascii (r_us_ascii d.i j) 1 d
 | 
						|
 | 
						|
let rec decode_iso_8859_1 d =
 | 
						|
  let rem = i_rem d in
 | 
						|
  if rem <= 0 then (if rem < 0 then `End else refill decode_iso_8859_1 d) else
 | 
						|
  let j = d.i_pos in
 | 
						|
  d.i_pos <- d.i_pos + 1; ret decode_iso_8859_1 (r_iso_8859_1 d.i j) 1 d
 | 
						|
 | 
						|
(* UTF-8 decoder *)
 | 
						|
 | 
						|
let rec t_decode_utf_8 d =                             (* decode from [d.t]. *)
 | 
						|
  if d.t_len < d.t_need
 | 
						|
  then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
 | 
						|
  else ret decode_utf_8 (r_utf_8 d.t 0 d.t_len) d.t_len d
 | 
						|
 | 
						|
and decode_utf_8 d =
 | 
						|
  let rem = i_rem d in
 | 
						|
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_8 d) else
 | 
						|
  let need = unsafe_array_get utf_8_len (unsafe_byte d.i d.i_pos) in
 | 
						|
  if rem < need then (t_need d need; t_fill t_decode_utf_8 d) else
 | 
						|
  let j = d.i_pos in
 | 
						|
  if need = 0
 | 
						|
  then (d.i_pos <- d.i_pos + 1; ret decode_utf_8 (malformed d.i j 1) 1 d)
 | 
						|
  else (d.i_pos <- d.i_pos + need; ret decode_utf_8 (r_utf_8 d.i j need) need d)
 | 
						|
 | 
						|
(* UTF-16BE decoder *)
 | 
						|
 | 
						|
let rec t_decode_utf_16be_lo hi d =                    (* decode from [d.t]. *)
 | 
						|
  let bcount = d.t_len + 2 (* hi count *) in
 | 
						|
  if d.t_len < d.t_need
 | 
						|
  then ret decode_utf_16be (malformed_pair true hi d.t 0 d.t_len) bcount d
 | 
						|
  else ret decode_utf_16be (r_utf_16_lo hi d.t 0 1) bcount d
 | 
						|
 | 
						|
and t_decode_utf_16be d =                              (* decode from [d.t]. *)
 | 
						|
  if d.t_len < d.t_need
 | 
						|
  then ret decode_utf_16be (malformed d.t 0 d.t_len) d.t_len d
 | 
						|
  else decode_utf_16be_lo (r_utf_16 d.t 0 1) d
 | 
						|
 | 
						|
and decode_utf_16be_lo v d = match v with
 | 
						|
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16be v 2 d
 | 
						|
| `Hi hi ->
 | 
						|
    let rem = i_rem d in
 | 
						|
    if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16be_lo hi) d) else
 | 
						|
    let j = d.i_pos in
 | 
						|
    d.i_pos <- d.i_pos + 2;
 | 
						|
    ret decode_utf_16be (r_utf_16_lo hi d.i j (j + 1)) 4 d
 | 
						|
 | 
						|
and decode_utf_16be d =
 | 
						|
  let rem = i_rem d in
 | 
						|
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16be d) else
 | 
						|
  if rem < 2 then (t_need d 2; t_fill t_decode_utf_16be d) else
 | 
						|
  let j = d.i_pos in
 | 
						|
  d.i_pos <- d.i_pos + 2; decode_utf_16be_lo (r_utf_16 d.i j (j + 1)) d
 | 
						|
 | 
						|
(* UTF-16LE decoder, same as UTF-16BE with byte swapped. *)
 | 
						|
 | 
						|
let rec t_decode_utf_16le_lo hi d =                    (* decode from [d.t]. *)
 | 
						|
  let bcount = d.t_len + 2 (* hi count *) in
 | 
						|
  if d.t_len < d.t_need
 | 
						|
  then ret decode_utf_16le (malformed_pair false hi d.t 0 d.t_len) bcount d
 | 
						|
  else ret decode_utf_16le (r_utf_16_lo hi d.t 1 0) bcount d
 | 
						|
 | 
						|
and t_decode_utf_16le d =                              (* decode from [d.t]. *)
 | 
						|
  if d.t_len < d.t_need
 | 
						|
  then ret decode_utf_16le (malformed d.t 0 d.t_len) d.t_len d
 | 
						|
  else decode_utf_16le_lo (r_utf_16 d.t 1 0) d
 | 
						|
 | 
						|
and decode_utf_16le_lo v d = match v with
 | 
						|
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16le v 2 d
 | 
						|
| `Hi hi ->
 | 
						|
    let rem = i_rem d in
 | 
						|
    if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16le_lo hi) d) else
 | 
						|
    let j = d.i_pos in
 | 
						|
    d.i_pos <- d.i_pos + 2;
 | 
						|
    ret decode_utf_16le (r_utf_16_lo hi d.i (j + 1) j) 4 d
 | 
						|
 | 
						|
and decode_utf_16le d =
 | 
						|
  let rem = i_rem d in
 | 
						|
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16le d) else
 | 
						|
  if rem < 2 then (t_need d 2; t_fill t_decode_utf_16le d) else
 | 
						|
  let j = d.i_pos in
 | 
						|
  d.i_pos <- d.i_pos + 2; decode_utf_16le_lo (r_utf_16 d.i (j + 1) j) d
 | 
						|
 | 
						|
(* Encoding guessing. The guess is simple but starting the decoder
 | 
						|
   after is tedious, uutf's decoders are not designed to put bytes
 | 
						|
   back in the stream. *)
 | 
						|
 | 
						|
let guessed_utf_8 d =                   (* start decoder after `UTF_8 guess. *)
 | 
						|
  let b3 d =                                 (* handles the third read byte. *)
 | 
						|
    let b3 = unsafe_byte d.t 2 in
 | 
						|
    match utf_8_len.(b3) with
 | 
						|
    | 0 -> ret decode_utf_8 (malformed d.t 2 1) 1 d
 | 
						|
    | n ->
 | 
						|
        d.t_need <- n; d.t_len <- 1; unsafe_set_byte d.t 0 b3;
 | 
						|
        t_fill t_decode_utf_8 d
 | 
						|
  in
 | 
						|
  let b2 d =                                     (* handle second read byte. *)
 | 
						|
    let b2 = unsafe_byte d.t 1 in
 | 
						|
    let b3 = if d.t_len > 2 then b3 else decode_utf_8 (* decodes `End *) in
 | 
						|
    match utf_8_len.(b2) with
 | 
						|
    | 0 -> ret b3 (malformed d.t 1 1) 1 d
 | 
						|
    | 1 -> ret b3 (r_utf_8 d.t 1 1) 1 d
 | 
						|
    | n ->                         (* copy d.t.(1-2) to d.t.(0-1) and decode *)
 | 
						|
        d.t_need <- n;
 | 
						|
        unsafe_set_byte d.t 0 b2;
 | 
						|
        if (d.t_len < 3) then d.t_len <- 1 else
 | 
						|
        (d.t_len <- 2; unsafe_set_byte d.t 1 (unsafe_byte d.t 2); );
 | 
						|
        t_fill t_decode_utf_8 d
 | 
						|
  in
 | 
						|
  let b1 = unsafe_byte d.t 0 in                   (* handle first read byte. *)
 | 
						|
  let b2 = if d.t_len > 1 then b2 else decode_utf_8 (* decodes `End *) in
 | 
						|
  match utf_8_len.(b1) with
 | 
						|
  | 0 -> ret b2 (malformed d.t 0 1) 1 d
 | 
						|
  | 1 -> ret b2 (r_utf_8 d.t 0 1) 1 d
 | 
						|
  | 2 ->
 | 
						|
      if d.t_len < 2 then ret decode_utf_8 (malformed d.t 0 1) 1 d else
 | 
						|
      if d.t_len < 3 then ret decode_utf_8 (r_utf_8 d.t 0 2) 2 d else
 | 
						|
      ret b3 (r_utf_8 d.t 0 2) 2 d
 | 
						|
  | 3 ->
 | 
						|
      if d.t_len < 3
 | 
						|
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
 | 
						|
      else ret decode_utf_8 (r_utf_8 d.t 0 3) 3 d
 | 
						|
  | 4 ->
 | 
						|
      if d.t_len < 3
 | 
						|
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
 | 
						|
      else (d.t_need <- 4; t_fill t_decode_utf_8 d)
 | 
						|
  | n -> assert false
 | 
						|
 | 
						|
let guessed_utf_16 d be v =     (* start decoder after `UTF_16{BE,LE} guess. *)
 | 
						|
  let decode_utf_16, t_decode_utf_16, t_decode_utf_16_lo, j0, j1 =
 | 
						|
    if be then decode_utf_16be, t_decode_utf_16be, t_decode_utf_16be_lo, 0, 1
 | 
						|
    else decode_utf_16le, t_decode_utf_16le, t_decode_utf_16le_lo, 1, 0
 | 
						|
  in
 | 
						|
  let b3 k d =
 | 
						|
    if d.t_len < 3 then decode_utf_16 d (* decodes `End *) else
 | 
						|
    begin                             (* copy d.t.(2) to d.t.(0) and decode. *)
 | 
						|
      d.t_need <- 2; d.t_len <- 1;
 | 
						|
      unsafe_set_byte d.t 0 (unsafe_byte d.t 2);
 | 
						|
      t_fill k d
 | 
						|
    end
 | 
						|
  in
 | 
						|
  match v with
 | 
						|
  | `BOM -> ret (b3 t_decode_utf_16) (`Uchar u_bom) 2 d
 | 
						|
  | `ASCII u -> ret (b3 t_decode_utf_16) (`Uchar u) 2 d
 | 
						|
  | `Decode ->
 | 
						|
      match r_utf_16 d.t j0 j1 with
 | 
						|
      | `Malformed _ | `Uchar _ as v -> ret (b3 t_decode_utf_16) v 2 d
 | 
						|
      | `Hi hi ->
 | 
						|
        if d.t_len < 3
 | 
						|
        then ret decode_utf_16 (malformed_pair be hi "" 0 0) d.t_len d
 | 
						|
        else (b3 (t_decode_utf_16_lo hi)) d
 | 
						|
 | 
						|
let guess_encoding d =                  (* guess encoding and start decoder. *)
 | 
						|
  let setup d = match r_encoding d.t 0 d.t_len with
 | 
						|
  | `UTF_8 r ->
 | 
						|
      d.encoding <- `UTF_8; d.k <- decode_utf_8;
 | 
						|
      begin match r with
 | 
						|
      | `BOM -> ret decode_utf_8 (`Uchar u_bom) 3 d
 | 
						|
      | `Decode -> guessed_utf_8 d
 | 
						|
      | `End -> `End
 | 
						|
      end
 | 
						|
  | `UTF_16BE r ->
 | 
						|
      d.encoding <- `UTF_16BE; d.k <- decode_utf_16be; guessed_utf_16 d true r
 | 
						|
  | `UTF_16LE r ->
 | 
						|
      d.encoding <- `UTF_16LE; d.k <- decode_utf_16le; guessed_utf_16 d false r
 | 
						|
 | 
						|
  in
 | 
						|
  (t_need d 3; t_fill setup d)
 | 
						|
 | 
						|
(* Character post-processors. Used for BOM handling, newline
 | 
						|
   normalization and position tracking. The [pp_remove_bom] is only
 | 
						|
   used for the first character to remove a possible initial BOM and
 | 
						|
   handle UTF-16 endianness recognition. *)
 | 
						|
 | 
						|
let nline d = d.col <- 0; d.line <- d.line + 1                   (* inlined. *)
 | 
						|
let ncol d = d.col <- d.col + 1                                  (* inlined. *)
 | 
						|
let ncount d = d.count <- d.count + 1                            (* inlined. *)
 | 
						|
let cr d b = d.last_cr <- b                                      (* inlined. *)
 | 
						|
 | 
						|
let pp_remove_bom utf16 pp d = function(* removes init. BOM, handles UTF-16. *)
 | 
						|
| `Uchar 0xFEFF (* BOM *) ->
 | 
						|
    if utf16 then (d.encoding <- `UTF_16BE; d.k <- decode_utf_16be);
 | 
						|
    d.removed_bom <- true; d.pp <- pp; d.k d
 | 
						|
| `Uchar 0xFFFE (* BOM reversed from decode_utf_16be *) when utf16 ->
 | 
						|
    d.encoding <- `UTF_16LE; d.k <- decode_utf_16le;
 | 
						|
    d.removed_bom <- true; d.pp <- pp; d.k d
 | 
						|
| `Malformed _ | `Uchar _ as v ->
 | 
						|
    d.removed_bom <- false; d.pp <- pp; d.pp d v
 | 
						|
 | 
						|
let pp_nln_none d = function
 | 
						|
| `Uchar 0x000A (* LF *) as v ->
 | 
						|
    let last_cr = d.last_cr in
 | 
						|
    cr d false; ncount d; if last_cr then v else (nline d; v)
 | 
						|
| `Uchar 0x000D (* CR *) as v -> cr d true; ncount d; nline d; v
 | 
						|
| `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) as v ->
 | 
						|
    cr d false; ncount d; nline d; v
 | 
						|
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v
 | 
						|
 | 
						|
let pp_nln_readline d = function
 | 
						|
| `Uchar 0x000A (* LF *) ->
 | 
						|
    let last_cr = d.last_cr in
 | 
						|
    cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
 | 
						|
| `Uchar 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
 | 
						|
| `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
 | 
						|
    cr d false; ncount d; nline d; `Uchar d.nl
 | 
						|
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v
 | 
						|
 | 
						|
let pp_nln_nlf d = function
 | 
						|
| `Uchar 0x000A (* LF *) ->
 | 
						|
    let last_cr = d.last_cr in
 | 
						|
    cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
 | 
						|
| `Uchar 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
 | 
						|
| `Uchar 0x0085 (* NEL *) -> cr d false; ncount d; nline d; `Uchar d.nl
 | 
						|
| `Uchar (0x000C | 0x2028 | 0x2029) as v (* FF | LS | PS *) ->
 | 
						|
    cr d false; ncount d; nline d; v
 | 
						|
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v
 | 
						|
 | 
						|
let pp_nln_ascii d = function
 | 
						|
| `Uchar 0x000A (* LF *) ->
 | 
						|
    let last_cr = d.last_cr in
 | 
						|
    cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
 | 
						|
| `Uchar 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
 | 
						|
| `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) as v (* NEL | FF | LS | PS *) ->
 | 
						|
    cr d false; ncount d; nline d; v
 | 
						|
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v
 | 
						|
 | 
						|
let decode_fun = function
 | 
						|
| `UTF_8 -> decode_utf_8
 | 
						|
| `UTF_16 -> decode_utf_16be                         (* see [pp_remove_bom]. *)
 | 
						|
| `UTF_16BE -> decode_utf_16be
 | 
						|
| `UTF_16LE -> decode_utf_16le
 | 
						|
| `US_ASCII -> decode_us_ascii
 | 
						|
| `ISO_8859_1 -> decode_iso_8859_1
 | 
						|
 | 
						|
let decoder ?nln ?encoding src =
 | 
						|
  let pp, nl = match nln with
 | 
						|
  | None -> pp_nln_none, 0x000A (* not used. *)
 | 
						|
  | Some (`ASCII nl) -> pp_nln_ascii, nl
 | 
						|
  | Some (`NLF nl) -> pp_nln_nlf, nl
 | 
						|
  | Some (`Readline nl) -> pp_nln_readline, nl
 | 
						|
  in
 | 
						|
  let encoding, k = match encoding with
 | 
						|
  | None -> `UTF_8, guess_encoding
 | 
						|
  | Some e -> (e :> decoder_encoding), decode_fun e
 | 
						|
  in
 | 
						|
  let i, i_pos, i_max = match src with
 | 
						|
  | `Manual -> "", 1, 0                            (* implies src_rem d = 0. *)
 | 
						|
  | `Channel _ -> String.create io_buffer_size, 1, 0                (* idem. *)
 | 
						|
  | `String s -> s, 0, String.length s - 1
 | 
						|
  in
 | 
						|
  { src = (src :> src); encoding; nln = (nln :> nln option); nl;
 | 
						|
    i; i_pos; i_max; t = String.create 4; t_len = 0; t_need = 0;
 | 
						|
    removed_bom = false; last_cr = false; line = 1; col = 0;
 | 
						|
    byte_count = 0; count = 0;
 | 
						|
    pp = pp_remove_bom (encoding = `UTF_16) pp; k }
 | 
						|
 | 
						|
let decode d = d.k d
 | 
						|
let decoder_line d = d.line
 | 
						|
let decoder_col d = d.col
 | 
						|
let decoder_byte_count d = d.byte_count
 | 
						|
let decoder_count d = d.count
 | 
						|
let decoder_removed_bom d = d.removed_bom
 | 
						|
let decoder_src d = d.src
 | 
						|
let decoder_nln d = d.nln
 | 
						|
let decoder_encoding d = d.encoding
 | 
						|
let set_decoder_encoding d e =
 | 
						|
  d.encoding <- (e :> decoder_encoding); d.k <- decode_fun e
 | 
						|
 | 
						|
(* Encode *)
 | 
						|
 | 
						|
type dst = [ `Channel of out_channel | `Buffer of Buffer.t | `Manual ]
 | 
						|
type encode = [ `Await | `End | `Uchar of uchar ]
 | 
						|
type encoder =
 | 
						|
  { dst : dst;                                        (* output destination. *)
 | 
						|
    encoding : encoding;                                (* encoded encoding. *)
 | 
						|
    mutable o : string;                             (* current output chunk. *)
 | 
						|
    mutable o_pos : int;                   (* next output position to write. *)
 | 
						|
    mutable o_max : int;                (* maximal output position to write. *)
 | 
						|
    t : string;                 (* four bytes buffer for overlapping writes. *)
 | 
						|
    mutable t_pos : int;                    (* next position to read in [t]. *)
 | 
						|
    mutable t_max : int;                 (* maximal position to read in [t]. *)
 | 
						|
    mutable k :                                     (* encoder continuation. *)
 | 
						|
      encoder -> encode -> [ `Ok | `Partial ] }
 | 
						|
 | 
						|
(* On encodes that overlap two (or more) [e.o] buffers, we encode the
 | 
						|
   character to the temporary buffer [o.t] and continue with
 | 
						|
   [tmp_flush] to write this data on the different [e.o] buffers. If
 | 
						|
   the [e.o] buffers are not too small this is faster than
 | 
						|
   continuation based byte per byte writes. *)
 | 
						|
 | 
						|
let o_rem e = e.o_max - e.o_pos + 1    (* remaining bytes to write in [e.o]. *)
 | 
						|
let dst e s j l =                                     (* set [e.o] with [s]. *)
 | 
						|
  if (j < 0 || l < 0 || j + l > String.length s) then invalid_bounds j l;
 | 
						|
  e.o <- s; e.o_pos <- j; e.o_max <- j + l - 1
 | 
						|
 | 
						|
let partial k e = function `Await -> k e | `Uchar _ | `End -> invalid_encode ()
 | 
						|
let flush k e = match e.dst with(* get free storage in [d.o] and [k]ontinue. *)
 | 
						|
| `Manual -> e.k <- partial k; `Partial
 | 
						|
| `Buffer b -> Buffer.add_substring b e.o 0 e.o_pos; e.o_pos <- 0; k e
 | 
						|
| `Channel oc -> output oc e.o 0 e.o_pos; e.o_pos <- 0; k e
 | 
						|
 | 
						|
let t_range e max = e.t_pos <- 0; e.t_max <- max
 | 
						|
let rec t_flush k e =               (* flush [d.t] up to [d.t_max] in [d.i]. *)
 | 
						|
  let blit e l =
 | 
						|
    unsafe_blit e.t e.t_pos e.o e.o_pos l;
 | 
						|
    e.o_pos <- e.o_pos + l; e.t_pos <- e.t_pos + l
 | 
						|
  in
 | 
						|
  let rem = o_rem e in
 | 
						|
  let len = e.t_max - e.t_pos + 1 in
 | 
						|
  if rem < len then (blit e rem; flush (t_flush k) e) else (blit e len; k e)
 | 
						|
 | 
						|
(* Encoders. *)
 | 
						|
 | 
						|
let rec encode_utf_8 e v =
 | 
						|
  let k e = e.k <- encode_utf_8; `Ok in
 | 
						|
  match v with
 | 
						|
  | `Await -> k e
 | 
						|
  | `End -> flush k e
 | 
						|
  | `Uchar u as v ->
 | 
						|
      let rem = o_rem e in
 | 
						|
      if u <= 0x007F then
 | 
						|
      if rem < 1 then flush (fun e -> encode_utf_8 e v) e else
 | 
						|
      (unsafe_set_byte e.o e.o_pos u; e.o_pos <- e.o_pos + 1; k e)
 | 
						|
      else if u <= 0x07FF then
 | 
						|
      begin
 | 
						|
        let s, j, k =
 | 
						|
          if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
 | 
						|
          let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
 | 
						|
        in
 | 
						|
        unsafe_set_byte s j (0xC0 lor (u lsr 6));
 | 
						|
        unsafe_set_byte s (j + 1) (0x80 lor (u land 0x3F));
 | 
						|
        k e
 | 
						|
      end
 | 
						|
      else if u <= 0xFFFF then
 | 
						|
      begin
 | 
						|
        let s, j, k =
 | 
						|
          if rem < 3 then (t_range e 2; e.t, 0, t_flush k) else
 | 
						|
          let j = e.o_pos in (e.o_pos <- e.o_pos + 3; e.o, j, k)
 | 
						|
        in
 | 
						|
        unsafe_set_byte s j (0xE0 lor (u lsr 12));
 | 
						|
        unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 6) land 0x3F));
 | 
						|
        unsafe_set_byte s (j + 2) (0x80 lor (u land 0x3F));
 | 
						|
        k e
 | 
						|
      end
 | 
						|
      else
 | 
						|
      begin
 | 
						|
        let s, j, k =
 | 
						|
          if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
 | 
						|
          let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
 | 
						|
        in
 | 
						|
        unsafe_set_byte s j (0xF0 lor (u lsr 18));
 | 
						|
        unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 12) land 0x3F));
 | 
						|
        unsafe_set_byte s (j + 2) (0x80 lor ((u lsr 6) land 0x3F));
 | 
						|
        unsafe_set_byte s (j + 3) (0x80 lor (u land 0x3F));
 | 
						|
        k e
 | 
						|
      end
 | 
						|
 | 
						|
let rec encode_utf_16be e v =
 | 
						|
  let k e = e.k <- encode_utf_16be; `Ok in
 | 
						|
  match v with
 | 
						|
  | `Await -> k e
 | 
						|
  | `End -> flush k e
 | 
						|
  | `Uchar u ->
 | 
						|
      let rem = o_rem e in
 | 
						|
      if u < 0x10000 then
 | 
						|
      begin
 | 
						|
        let s, j, k =
 | 
						|
          if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
 | 
						|
          let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
 | 
						|
        in
 | 
						|
        unsafe_set_byte s j (u lsr 8);
 | 
						|
        unsafe_set_byte s (j + 1) (u land 0xFF);
 | 
						|
        k e
 | 
						|
      end else begin
 | 
						|
        let s, j, k =
 | 
						|
          if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
 | 
						|
          let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
 | 
						|
        in
 | 
						|
        let u' = u - 0x10000 in
 | 
						|
        let hi = (0xD800 lor (u' lsr 10)) in
 | 
						|
        let lo = (0xDC00 lor (u' land 0x3FF)) in
 | 
						|
        unsafe_set_byte s j (hi lsr 8);
 | 
						|
        unsafe_set_byte s (j + 1) (hi land 0xFF);
 | 
						|
        unsafe_set_byte s (j + 2) (lo lsr 8);
 | 
						|
        unsafe_set_byte s (j + 3) (lo land 0xFF);
 | 
						|
        k e
 | 
						|
      end
 | 
						|
 | 
						|
let rec encode_utf_16le e v =         (* encode_uft_16be with bytes swapped. *)
 | 
						|
  let k e = e.k <- encode_utf_16le; `Ok in
 | 
						|
  match v with
 | 
						|
  | `Await -> k e
 | 
						|
  | `End -> flush k e
 | 
						|
  | `Uchar u ->
 | 
						|
    let rem = o_rem e in
 | 
						|
    if u < 0x10000 then
 | 
						|
    begin
 | 
						|
      let s, j, k =
 | 
						|
        if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
 | 
						|
        let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
 | 
						|
      in
 | 
						|
      unsafe_set_byte s j (u land 0xFF);
 | 
						|
      unsafe_set_byte s (j + 1) (u lsr 8);
 | 
						|
      k e
 | 
						|
    end
 | 
						|
    else
 | 
						|
    begin
 | 
						|
      let s, j, k =
 | 
						|
        if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
 | 
						|
        let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
 | 
						|
      in
 | 
						|
      let u' = u - 0x10000 in
 | 
						|
      let hi = (0xD800 lor (u' lsr 10)) in
 | 
						|
      let lo = (0xDC00 lor (u' land 0x3FF)) in
 | 
						|
      unsafe_set_byte s j (hi land 0xFF);
 | 
						|
      unsafe_set_byte s (j + 1) (hi lsr 8);
 | 
						|
      unsafe_set_byte s (j + 2) (lo land 0xFF);
 | 
						|
      unsafe_set_byte s (j + 3) (lo lsr 8);
 | 
						|
      k e
 | 
						|
    end
 | 
						|
 | 
						|
let encode_fun = function
 | 
						|
| `UTF_8 -> encode_utf_8
 | 
						|
| `UTF_16 -> encode_utf_16be
 | 
						|
| `UTF_16BE -> encode_utf_16be
 | 
						|
| `UTF_16LE -> encode_utf_16le
 | 
						|
 | 
						|
let encoder encoding dst =
 | 
						|
  let o, o_pos, o_max = match dst with
 | 
						|
  | `Manual -> "", 1, 0                              (* implies o_rem e = 0. *)
 | 
						|
  | `Buffer _
 | 
						|
  | `Channel _ -> String.create io_buffer_size, 0, io_buffer_size - 1
 | 
						|
  in
 | 
						|
  { dst = (dst :> dst); encoding = (encoding :> encoding); o; o_pos; o_max;
 | 
						|
    t = String.create 4; t_pos = 1; t_max = 0; k = encode_fun encoding}
 | 
						|
 | 
						|
let encode e v = e.k e (v :> encode)
 | 
						|
let encoder_encoding e = e.encoding
 | 
						|
let encoder_dst e = e.dst
 | 
						|
 | 
						|
(* Manual sources and destinations. *)
 | 
						|
 | 
						|
module Manual = struct
 | 
						|
  let src = src
 | 
						|
  let dst = dst
 | 
						|
  let dst_rem = o_rem
 | 
						|
end
 | 
						|
 | 
						|
(* Strings folders and Buffer encoders *)
 | 
						|
 | 
						|
module String = struct
 | 
						|
  let encoding_guess s = match r_encoding s 0 (max (String.length s) 3) with
 | 
						|
  | `UTF_8 d -> `UTF_8, (d = `BOM)
 | 
						|
  | `UTF_16BE d -> `UTF_16BE, (d = `BOM)
 | 
						|
  | `UTF_16LE d -> `UTF_16LE, (d = `BOM)
 | 
						|
 | 
						|
  type 'a folder =
 | 
						|
    'a -> int -> [ `Uchar of uchar | `Malformed of string ] -> 'a
 | 
						|
 | 
						|
  let fold_utf_8 f acc s =
 | 
						|
    let rec loop acc f s i l =
 | 
						|
      if i = l then acc else
 | 
						|
      let need = unsafe_array_get utf_8_len (unsafe_byte s i) in
 | 
						|
      if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) l else
 | 
						|
      let rem = l - i in
 | 
						|
      if rem < need then f acc i (malformed s i rem) else
 | 
						|
      loop (f acc i (r_utf_8 s i need)) f s (i + need) l
 | 
						|
    in
 | 
						|
    loop acc f s 0 (String.length s)
 | 
						|
 | 
						|
  let fold_utf_16be f acc s =
 | 
						|
    let rec loop acc f s i l =
 | 
						|
      if i = l then acc else
 | 
						|
      let rem = l - i in
 | 
						|
      if rem < 2 then f acc i (malformed s i 1) else
 | 
						|
      match r_utf_16 s i (i + 1) with
 | 
						|
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) l
 | 
						|
      | `Hi hi ->
 | 
						|
          if rem < 4 then f acc i (malformed s i rem)  else
 | 
						|
          loop (f acc i (r_utf_16_lo hi s (i + 2) (i + 3))) f s (i + 4) l
 | 
						|
    in
 | 
						|
    loop acc f s 0 (String.length s)
 | 
						|
 | 
						|
  let fold_utf_16le f acc s =             (* [fold_utf_16be], bytes swapped. *)
 | 
						|
    let rec loop acc f s i l =
 | 
						|
      if i = l then acc else
 | 
						|
      let rem = l - i in
 | 
						|
      if rem < 2 then f acc i (malformed s i 1) else
 | 
						|
      match r_utf_16 s (i + 1) i with
 | 
						|
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) l
 | 
						|
      | `Hi hi ->
 | 
						|
          if rem < 4 then f acc i (malformed s i rem)  else
 | 
						|
          loop (f acc i (r_utf_16_lo hi s (i + 3) (i + 2))) f s (i + 4) l
 | 
						|
    in
 | 
						|
    loop acc f s 0 (String.length s)
 | 
						|
end
 | 
						|
 | 
						|
module Buffer = struct
 | 
						|
  let add_utf_8 b u =
 | 
						|
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
 | 
						|
    if u <= 0x007F then
 | 
						|
    (w u)
 | 
						|
    else if u <= 0x07FF then
 | 
						|
    (w (0xC0 lor (u lsr 6));
 | 
						|
     w (0x80 lor (u land 0x3F)))
 | 
						|
    else if u <= 0xFFFF then
 | 
						|
    (w (0xE0 lor (u lsr 12));
 | 
						|
     w (0x80 lor ((u lsr 6) land 0x3F));
 | 
						|
     w (0x80 lor (u land 0x3F)))
 | 
						|
    else
 | 
						|
    (w (0xF0 lor (u lsr 18));
 | 
						|
     w (0x80 lor ((u lsr 12) land 0x3F));
 | 
						|
     w (0x80 lor ((u lsr 6) land 0x3F));
 | 
						|
     w (0x80 lor (u land 0x3F)))
 | 
						|
 | 
						|
  let add_utf_16be b u =
 | 
						|
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
 | 
						|
    if u < 0x10000 then (w (u lsr 8); w (u land 0xFF)) else
 | 
						|
    let u' = u - 0x10000 in
 | 
						|
    let hi = (0xD800 lor (u' lsr 10)) in
 | 
						|
    let lo = (0xDC00 lor (u' land 0x3FF)) in
 | 
						|
    w (hi lsr 8); w (hi land 0xFF);
 | 
						|
    w (lo lsr 8); w (lo land 0xFF)
 | 
						|
 | 
						|
  let add_utf_16le b u =                            (* swapped add_utf_16be. *)
 | 
						|
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
 | 
						|
    if u < 0x10000 then (w (u land 0xFF); w (u lsr 8)) else
 | 
						|
    let u' = u - 0x10000 in
 | 
						|
    let hi = (0xD800 lor (u' lsr 10)) in
 | 
						|
    let lo = (0xDC00 lor (u' land 0x3FF)) in
 | 
						|
    w (hi land 0xFF); w (hi lsr 8);
 | 
						|
    w (lo land 0xFF); w (lo lsr 8)
 | 
						|
end
 | 
						|
 | 
						|
(*---------------------------------------------------------------------------
 | 
						|
   Copyright 2012 Daniel C. Bünzli
 | 
						|
   All rights reserved.
 | 
						|
 | 
						|
   Redistribution and use in source and binary forms, with or without
 | 
						|
   modification, are permitted provided that the following conditions
 | 
						|
   are met:
 | 
						|
 | 
						|
   1. Redistributions of source code must retain the above copyright
 | 
						|
      notice, this list of conditions and the following disclaimer.
 | 
						|
 | 
						|
   2. Redistributions in binary form must reproduce the above
 | 
						|
      copyright notice, this list of conditions and the following
 | 
						|
      disclaimer in the documentation and/or other materials provided
 | 
						|
      with the distribution.
 | 
						|
 | 
						|
   3. Neither the name of Daniel C. Bünzli nor the names of
 | 
						|
      contributors may be used to endorse or promote products derived
 | 
						|
      from this software without specific prior written permission.
 | 
						|
 | 
						|
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 | 
						|
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 | 
						|
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 | 
						|
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 | 
						|
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 | 
						|
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 | 
						|
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 | 
						|
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 | 
						|
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 | 
						|
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | 
						|
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
						|
  ---------------------------------------------------------------------------*)
 |