(*---------------------------------------------------------------------------
   Copyright 2012 Daniel C. Bünzli. All rights reserved.
   Distributed under the BSD3 license, see license at the end of the file.
   %%NAME%% release %%VERSION%%
  ---------------------------------------------------------------------------*)

(* Size of the internal buffers allocated for channel sources and for
   buffer or channel destinations. *)
let io_buffer_size = 65536                          (* IO_BUFFER_SIZE 4.0.0 *)

let pp = Format.fprintf

(* Raised by an encoder that was left in a partial state and is given
   something else than `Await. *)
let invalid_encode () = invalid_arg "expected `Await encode"

(* Raised when a client provided (index, length) pair does not denote a
   valid range of a string. *)
let invalid_bounds j l =
  invalid_arg (Printf.sprintf "invalid bounds (index %d, length %d)" j l)

(* Unsafe string byte manipulations. If you don't believe the author's
   invariants, replacing with safe versions makes everything safe in
   the module. He won't be upset. *)

let unsafe_chr = Char.unsafe_chr
let unsafe_blit = String.unsafe_blit
let unsafe_array_get = Array.unsafe_get
let unsafe_byte s j = Char.code (String.unsafe_get s j)
let unsafe_set_byte s j byte = String.unsafe_set s j (Char.unsafe_chr byte)

(* Unicode characters *)

type uchar = int
let u_bom = 0xFEFF                                                   (* BOM. *)
let u_rep = 0xFFFD                                 (* replacement character. *)

(* [is_uchar cp] is [true] iff [cp] is in one of the two ranges
   0x0000-0xD7FF or 0xE000-0x10FFFF, i.e. not a surrogate and not beyond
   0x10FFFF. *)
let is_uchar cp =
  (0x0000 <= cp && cp <= 0xD7FF) || (0xE000 <= cp && cp <= 0x10FFFF)

(* [pp_cp ppf cp] prints [cp] on [ppf] in U+XXXX notation: at least four
   upper case hexadecimal digits, or U+Invalid(X) if [cp] is negative or
   greater than 0x10FFFF. *)
let pp_cp ppf cp =
  if cp < 0 || cp > 0x10FFFF then pp ppf "U+Invalid(%X)" cp else
  if cp <= 0xFFFF then pp ppf "U+%04X" cp else
  pp ppf "U+%X" cp

(* [cp_to_string cp] is the text printed by [pp_cp] as a string.

   The result is built with [Printf.sprintf] using the same three format
   strings as [pp_cp] rather than by writing to the shared
   [Format.str_formatter]; this avoids mutating global state and picking up
   text other code may have left pending in that formatter. *)
let cp_to_string cp =
  if cp < 0 || cp > 0x10FFFF then Printf.sprintf "U+Invalid(%X)" cp else
  if cp <= 0xFFFF then Printf.sprintf "U+%04X" cp else
  Printf.sprintf "U+%X" cp

(* Unicode encoding schemes *)

type encoding = [ `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE ]
type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ]

(* [encoding_of_string s] maps the name [s], compared after upper casing,
   to an encoding; [None] if the name is not one of those listed. *)
let encoding_of_string s = match String.uppercase s with      (* IANA names. *)
| "UTF-8" -> Some `UTF_8
| "UTF-16" -> Some `UTF_16
| "UTF-16LE" -> Some `UTF_16LE
| "UTF-16BE" -> Some `UTF_16BE
| "ANSI_X3.4-1968" | "ISO-IR-6" | "ANSI_X3.4-1986" | "ISO_646.IRV:1991"
| "ASCII" | "ISO646-US" | "US-ASCII" | "US" | "IBM367" | "CP367" | "CSASCII" ->
    Some `US_ASCII
| "ISO_8859-1:1987" | "ISO-IR-100" | "ISO_8859-1" | "ISO-8859-1" | "LATIN1"
| "L1" | "IBM819" | "CP819" | "CSISOLATIN1" ->
    Some `ISO_8859_1
| _ -> None

(* [encoding_to_string e] is the name used by this module for [e]. Each
   returned name is accepted by [encoding_of_string]. *)
let encoding_to_string = function
| `UTF_8 -> "UTF-8"
| `UTF_16 -> "UTF-16"
| `UTF_16BE -> "UTF-16BE"
| `UTF_16LE -> "UTF-16LE"
| `US_ASCII -> "US-ASCII"
| `ISO_8859_1 -> "ISO-8859-1"

(* Base character decoders. They assume enough data. *)

(* [malformed s j l] is a `Malformed value holding a copy of the [l] bytes
   of [s] starting at [j]. *)
let malformed s j l = `Malformed (String.sub s j l)

(* [malformed_pair be hi s j l] is a `Malformed value made of the two bytes
   of the high surrogate [hi], serialized big endian iff [be] is [true],
   followed by the [l] bytes of [s] starting at [j]. *)
let malformed_pair be hi s j l =    (* missing or half low surrogate at eoi. *)
  let bs1 = String.sub s j l in
  let bs0 = String.create 2 in
  let j0, j1 = if be then (0, 1) else (1, 0) in
  unsafe_set_byte bs0 j0 (hi lsr 8);
  unsafe_set_byte bs0 j1 (hi land 0xFF);
  `Malformed (bs0 ^ bs1)

(* [r_us_ascii s j] decodes the byte at [j] in [s]; bytes greater than 127
   are malformed. *)
let r_us_ascii s j =
  (* assert (0 <= j && j < String.length s); *)
  let b0 = unsafe_byte s j in
  if b0 <= 127 then `Uchar b0 else malformed s j 1

(* [r_iso_8859_1 s j] decodes the byte at [j] in [s]; the byte value is the
   character. *)
let r_iso_8859_1 s j =
  (* assert (0 <= j && j < String.length s); *)
  `Uchar (unsafe_byte s j)

(* The table has 256 entries, one per possible first byte; [0] marks a byte
   that cannot start a sequence. It is read with [unsafe_array_get], so its
   length must not change. *)
let utf_8_len = [| (* uchar byte length according to first UTF-8 byte. *)
  (* 0x00 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x10 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x20 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x30 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x40 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x50 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x60 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x70 *) 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
  (* 0x80 *) 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  (* 0x90 *) 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  (* 0xA0 *) 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  (* 0xB0 *) 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
  (* 0xC0 *) 0; 0; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2;
  (* 0xD0 *) 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2;
  (* 0xE0 *) 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3;
  (* 0xF0 *) 4; 4; 4; 4; 4; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |]

(* [r_utf_8 s j l] decodes the [l] bytes sequence starting at [j] in [s].
   [l] must be the non zero length given by [utf_8_len] for the first byte
   (any other value hits [assert false]). Continuation bytes must have
   their two top bits equal to 0b10; the second byte is further restricted
   after the first bytes 0xE0, 0xED, 0xF0 and 0xF4. On failure the whole
   [l] bytes are returned as malformed. *)
let r_utf_8 s j l =
  (* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
  match l with
  | 1 -> `Uchar (unsafe_byte s j)
  | 2 ->
      let b0 = unsafe_byte s j in
      let b1 = unsafe_byte s (j + 1) in
      if b1 lsr 6 <> 0b10 then malformed s j l else
      `Uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
  | 3 ->
      let b0 = unsafe_byte s j in
      let b1 = unsafe_byte s (j + 1) in
      let b2 = unsafe_byte s (j + 2) in
      let c = `Uchar (((b0 land 0x0F) lsl 12) lor
                      ((b1 land 0x3F) lsl 6) lor
                      (b2 land 0x3F))
      in
      if b2 lsr 6 <> 0b10 then malformed s j l else
      begin match b0 with
      | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then malformed s j l else c
      | 0xED -> if b1 < 0x80 || 0x9F < b1 then malformed s j l else c
      | _ -> if b1 lsr 6 <> 0b10 then malformed s j l else c
      end
  | 4 ->
      let b0 = unsafe_byte s j in
      let b1 = unsafe_byte s (j + 1) in
      let b2 = unsafe_byte s (j + 2) in
      let b3 = unsafe_byte s (j + 3) in
      let c = `Uchar (((b0 land 0x07) lsl 18) lor
                      ((b1 land 0x3F) lsl 12) lor
                      ((b2 land 0x3F) lsl 6) lor
                      (b3 land 0x3F))
      in
      if b3 lsr 6 <> 0b10 || b2 lsr 6 <> 0b10 then malformed s j l else
      begin match b0 with
      | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then malformed s j l else c
      | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then malformed s j l else c
      | _ -> if b1 lsr 6 <> 0b10 then malformed s j l else c
      end
  | _ -> assert false

(* [r_utf_16 s j0 j1] reads a 16 bits code unit whose most significant byte
   is at [j0] and least significant byte at [j1] in [s]. The result is
   `Uchar for a non surrogate, `Hi for a high surrogate and malformed for a
   lone low surrogate. *)
let r_utf_16 s j0 j1 =                       (* May return a high surrogate. *)
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
  let b0 = unsafe_byte s j0 in
  let b1 = unsafe_byte s j1 in
  let u = (b0 lsl 8) lor b1 in
  if u < 0xD800 || u > 0xDFFF then `Uchar u else
  if u > 0xDBFF then malformed s (min j0 j1) 2 else `Hi u

(* [r_utf_16_lo hi s j0 j1] reads a code unit like [r_utf_16] and, if it is
   a low surrogate, combines it with [hi]. Otherwise the four bytes of the
   pair are returned as malformed; endianness is deduced from the order of
   [j0] and [j1]. *)
let r_utf_16_lo hi s j0 j1 =          (* Combines [hi] with a low surrogate. *)
  (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *)
  let b0 = unsafe_byte s j0 in
  let b1 = unsafe_byte s j1 in
  let lo = (b0 lsl 8) lor b1 in
  if lo < 0xDC00 || lo > 0xDFFF
  then malformed_pair (j0 < j1 (* true => be *)) hi s (min j0 j1) 2
  else `Uchar ((((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000)

(* [r_encoding s j l] guesses an encoding from the [l] bytes starting at [j]
   in [s]; only the first three are looked at. The payload tells what was
   recognized: `BOM for a byte order mark, `ASCII p for a non zero byte [p]
   paired with a zero byte, `Decode when the bytes still have to be decoded
   and `End when [l] is zero. *)
let r_encoding s j l =                  (* guess encoding with max. 3 bytes. *)
  (* assert (0 <= j && 0 <= l && j + l <= String.length s) *)
  let some i = if i < l then Some (unsafe_byte s (j + i)) else None in
  match (some 0), (some 1), (some 2) with
  | Some 0xEF, Some 0xBB, Some 0xBF -> `UTF_8 `BOM
  | Some 0xFE, Some 0xFF, _ -> `UTF_16BE `BOM
  | Some 0xFF, Some 0xFE, _ -> `UTF_16LE `BOM
  | Some 0x00, Some p, _ when p > 0 -> `UTF_16BE (`ASCII p)
  | Some p, Some 0x00, _ when p > 0 -> `UTF_16LE (`ASCII p)
  | Some u, _, _ when utf_8_len.(u) <> 0 -> `UTF_8 `Decode
  | Some _, Some _, _ -> `UTF_16BE `Decode
  | Some _, None , None -> `UTF_8 `Decode
  | None , None , None -> `UTF_8 `End
  | None , Some _, _ -> assert false
  | Some _, None , Some _ -> assert false
  | None , None , Some _ -> assert false

(* Decode *)

type src = [ `Channel of in_channel | `String of string | `Manual ]
type nln = [ `ASCII of uchar | `NLF of uchar | `Readline of uchar ]
type decode = [ `Await | `End | `Malformed of string | `Uchar of uchar]

(* [pp_decode ppf v] prints [v] on [ppf]; malformed bytes are printed as
   space separated two digits hexadecimal numbers. *)
let pp_decode ppf = function
| `Uchar u -> pp ppf "@[`Uchar %a@]" pp_cp u
| `End -> pp ppf "`End"
| `Await -> pp ppf "`Await"
| `Malformed bs ->
    let l = String.length bs in
    pp ppf "@[`Malformed (";
    if l > 0 then pp ppf "%02X" (Char.code (bs.[0]));
    for i = 1 to l - 1 do pp ppf " %02X" (Char.code (bs.[i])) done;
    pp ppf ")@]"

type decoder =
  { src : src;                                              (* input source. *)
    mutable encoding : decoder_encoding;                (* decoded encoding. *)
    nln : nln option;                     (* newline normalization (if any). *)
    nl : int;                            (* newline normalization character. *)
    mutable i : string;                              (* current input chunk. *)
    mutable i_pos : int;                          (* input current position. *)
    mutable i_max : int;                          (* input maximal position. *)
    t : string;        (* four bytes temporary buffer for overlapping reads. *)
    mutable t_len : int;                      (* current byte length of [t]. *)
    mutable t_need : int;                  (* number of bytes needed in [t]. *)
    mutable removed_bom : bool;     (* [true] if an initial BOM was removed. *)
    mutable last_cr : bool;                   (* [true] if last char was CR. *)
    mutable line : int;                                      (* line number. *)
    mutable col : int;                                     (* column number. *)
    mutable byte_count : int;                                 (* byte count. *)
    mutable count : int;                                      (* char count. *)
    mutable pp :        (* decoder post-processor for BOM, position and nln. *)
      decoder -> [ `Malformed of string | `Uchar of uchar ] -> decode;
    mutable k : decoder -> decode }                 (* decoder continuation. *)

(* On decodes that overlap two (or more) [d.i] buffers, we use [t_fill] to copy
   the input data to [d.t] and decode from there. If the [d.i] buffers are not
   too small this is faster than continuation based byte per byte writes.

   End of input (eoi) is signalled by [d.i_pos = 0] and [d.i_max = min_int]
   which implies that [i_rem d < 0] is [true]. *)

let i_rem d = d.i_max - d.i_pos + 1     (* remaining bytes to read in [d.i]. *)
let eoi d = d.i <- ""; d.i_pos <- 0; d.i_max <- min_int   (* set eoi in [d]. *)

(* [src d s j l] makes the [l] bytes of [s] starting at [j] the input of
   [d]; a zero [l] signals the end of input. Raises [Invalid_argument] if
   [j] and [l] do not denote a valid range of [s]. *)
let src d s j l =                                     (* set [d.i] with [s]. *)
  if (j < 0 || l < 0 || j + l > String.length s) then invalid_bounds j l else
  if (l = 0) then eoi d else
  (d.i <- s; d.i_pos <- j; d.i_max <- j + l - 1)

(* For `Manual sources [k] is stored in [d.k] and `Await is returned, the
   client has to provide input with [src]. For `String sources there is
   nothing more to read, eoi is set. For `Channel sources the buffer [d.i]
   is filled from the channel; reading zero bytes sets eoi. *)
let refill k d = match d.src with  (* get new input in [d.i] and [k]ontinue. *)
| `Manual -> d.k <- k; `Await
| `String _ -> eoi d; k d
| `Channel ic ->
    let rc = input ic d.i 0 (String.length d.i) in
    (src d d.i 0 rc; k d)

(* [t_need d need] empties [d.t] and declares that [need] bytes are wanted
   in it. *)
let t_need d need = d.t_len <- 0; d.t_need <- need

let rec t_fill k d =   (* get [d.t_need] bytes (or less if eoi) in [d.t]. *)
  let blit d l =
    unsafe_blit d.i d.i_pos d.t d.t_len (* write pos. *) l;
    d.i_pos <- d.i_pos + l; d.t_len <- d.t_len + l
  in
  let rem = i_rem d in
  if rem < 0 (* eoi *) then k d else
  let need = d.t_need - d.t_len in
  if rem < need then (blit d rem; refill (t_fill k) d) else
  (blit d need; k d)

(* [ret k v byte_count d] installs [k] as the next continuation, adds
   [byte_count] to the decoder byte count and gives [v] to the post
   processor [d.pp]. *)
let ret k v byte_count d =                  (* return post-processed [v]. *)
  d.k <- k; d.byte_count <- d.byte_count + byte_count; d.pp d v

(* Decoders. Each one returns `End on eoi, refills when the current chunk is
   exhausted and otherwise decodes from [d.i], or from [d.t] when the
   character overlaps two chunks. *)

let rec decode_us_ascii d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_us_ascii d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 1; ret decode_us_ascii (r_us_ascii d.i j) 1 d

let rec decode_iso_8859_1 d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_iso_8859_1 d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 1; ret decode_iso_8859_1 (r_iso_8859_1 d.i j) 1 d

(* UTF-8 decoder *)

let rec t_decode_utf_8 d =                          (* decode from [d.t]. *)
  (* Less bytes than needed means eoi was hit: the truncated sequence is
     reported as malformed. *)
  if d.t_len < d.t_need
  then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
  else ret decode_utf_8 (r_utf_8 d.t 0 d.t_len) d.t_len d

and decode_utf_8 d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_8 d) else
  let need = unsafe_array_get utf_8_len (unsafe_byte d.i d.i_pos) in
  if rem < need then (t_need d need; t_fill t_decode_utf_8 d) else
  let j = d.i_pos in
  if need = 0
  then (d.i_pos <- d.i_pos + 1; ret decode_utf_8 (malformed d.i j 1) 1 d)
  else (d.i_pos <- d.i_pos + need; ret decode_utf_8 (r_utf_8 d.i j need) need d)

(* UTF-16BE decoder *)

let rec t_decode_utf_16be_lo hi d =                 (* decode from [d.t]. *)
  let bcount = d.t_len + 2 (* hi count *) in
  if d.t_len < d.t_need
  then ret decode_utf_16be (malformed_pair true hi d.t 0 d.t_len) bcount d
  else ret decode_utf_16be (r_utf_16_lo hi d.t 0 1) bcount d

and t_decode_utf_16be d =                           (* decode from [d.t]. *)
  if d.t_len < d.t_need
  then ret decode_utf_16be (malformed d.t 0 d.t_len) d.t_len d
  else decode_utf_16be_lo (r_utf_16 d.t 0 1) d

(* Returns [v] if it is final, otherwise reads the low surrogate that must
   follow the high surrogate. *)
and decode_utf_16be_lo v d = match v with
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16be v 2 d
| `Hi hi ->
    let rem = i_rem d in
    if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16be_lo hi) d) else
    let j = d.i_pos in
    d.i_pos <- d.i_pos + 2;
    ret decode_utf_16be (r_utf_16_lo hi d.i j (j + 1)) 4 d

and decode_utf_16be d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16be d) else
  if rem < 2 then (t_need d 2; t_fill t_decode_utf_16be d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 2; decode_utf_16be_lo (r_utf_16 d.i j (j + 1)) d

(* UTF-16LE decoder, same as UTF-16BE with byte swapped. *)

let rec t_decode_utf_16le_lo hi d =                 (* decode from [d.t]. *)
  let bcount = d.t_len + 2 (* hi count *) in
  if d.t_len < d.t_need
  then ret decode_utf_16le (malformed_pair false hi d.t 0 d.t_len) bcount d
  else ret decode_utf_16le (r_utf_16_lo hi d.t 1 0) bcount d

and t_decode_utf_16le d =                           (* decode from [d.t]. *)
  if d.t_len < d.t_need
  then ret decode_utf_16le (malformed d.t 0 d.t_len) d.t_len d
  else decode_utf_16le_lo (r_utf_16 d.t 1 0) d

and decode_utf_16le_lo v d = match v with
| `Uchar _ | `Malformed _ as v -> ret decode_utf_16le v 2 d
| `Hi hi ->
    let rem = i_rem d in
    if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16le_lo hi) d) else
    let j = d.i_pos in
    d.i_pos <- d.i_pos + 2;
    ret decode_utf_16le (r_utf_16_lo hi d.i (j + 1) j) 4 d

and decode_utf_16le d =
  let rem = i_rem d in
  if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16le d) else
  if rem < 2 then (t_need d 2; t_fill t_decode_utf_16le d) else
  let j = d.i_pos in
  d.i_pos <- d.i_pos + 2; decode_utf_16le_lo (r_utf_16 d.i (j + 1) j) d

(* Encoding guessing. The guess is simple but starting the decoder after is
   tedious, uutf's decoders are not designed to put bytes back in the
   stream. *)

(* On entry [d.t] holds the [d.t_len] (at most three) bytes read for the
   guess. They are decoded one character at a time, the continuations [b2]
   and [b3] resume at the second and third byte. *)
let guessed_utf_8 d =                 (* start decoder after `UTF_8 guess. *)
  let b3 d =                                 (* handles the third read byte. *)
    let b3 = unsafe_byte d.t 2 in
    match utf_8_len.(b3) with
    | 0 -> ret decode_utf_8 (malformed d.t 2 1) 1 d
    | n ->
        (* move the byte to the start of [d.t] and complete from input. *)
        d.t_need <- n; d.t_len <- 1; unsafe_set_byte d.t 0 b3;
        t_fill t_decode_utf_8 d
  in
  let b2 d =                                    (* handle second read byte. *)
    let b2 = unsafe_byte d.t 1 in
    let b3 = if d.t_len > 2 then b3 else decode_utf_8 (* decodes `End *) in
    match utf_8_len.(b2) with
    | 0 -> ret b3 (malformed d.t 1 1) 1 d
    | 1 -> ret b3 (r_utf_8 d.t 1 1) 1 d
    | n ->                     (* copy d.t.(1-2) to d.t.(0-1) and decode *)
        d.t_need <- n;
        unsafe_set_byte d.t 0 b2;
        if (d.t_len < 3) then d.t_len <- 1 else
        (d.t_len <- 2; unsafe_set_byte d.t 1 (unsafe_byte d.t 2));
        t_fill t_decode_utf_8 d
  in
  let b1 = unsafe_byte d.t 0 in                   (* handle first read byte. *)
  let b2 = if d.t_len > 1 then b2 else decode_utf_8 (* decodes `End *) in
  match utf_8_len.(b1) with
  | 0 -> ret b2 (malformed d.t 0 1) 1 d
  | 1 -> ret b2 (r_utf_8 d.t 0 1) 1 d
  | 2 ->
      if d.t_len < 2 then ret decode_utf_8 (malformed d.t 0 1) 1 d else
      if d.t_len < 3 then ret decode_utf_8 (r_utf_8 d.t 0 2) 2 d else
      ret b3 (r_utf_8 d.t 0 2) 2 d
  | 3 ->
      if d.t_len < 3
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
      else ret decode_utf_8 (r_utf_8 d.t 0 3) 3 d
  | 4 ->
      if d.t_len < 3
      then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d
      else (d.t_need <- 4; t_fill t_decode_utf_8 d)
  | _ -> assert false

(* [guessed_utf_16 d be v] resumes decoding with the big endian decoders iff
   [be] is [true]. [v] is the payload returned by [r_encoding]; the first
   two bytes of [d.t] are consumed here, [b3] hands a possible third byte
   over to the regular decoder. *)
let guessed_utf_16 d be v =     (* start decoder after `UTF_16{BE,LE} guess. *)
  let decode_utf_16, t_decode_utf_16, t_decode_utf_16_lo, j0, j1 =
    if be then decode_utf_16be, t_decode_utf_16be, t_decode_utf_16be_lo, 0, 1
    else decode_utf_16le, t_decode_utf_16le, t_decode_utf_16le_lo, 1, 0
  in
  let b3 k d =
    if d.t_len < 3 then decode_utf_16 d (* decodes `End *) else
    begin                          (* copy d.t.(2) to d.t.(0) and decode. *)
      d.t_need <- 2; d.t_len <- 1;
      unsafe_set_byte d.t 0 (unsafe_byte d.t 2);
      t_fill k d
    end
  in
  match v with
  | `BOM -> ret (b3 t_decode_utf_16) (`Uchar u_bom) 2 d
  | `ASCII u -> ret (b3 t_decode_utf_16) (`Uchar u) 2 d
  | `Decode ->
      match r_utf_16 d.t j0 j1 with
      | `Malformed _ | `Uchar _ as v -> ret (b3 t_decode_utf_16) v 2 d
      | `Hi hi ->
          if d.t_len < 3
          then ret decode_utf_16 (malformed_pair be hi "" 0 0) d.t_len d
          else (b3 (t_decode_utf_16_lo hi)) d

(* Reads up to three bytes in [d.t], sets [d.encoding] and [d.k] according
   to the guess of [r_encoding] and starts decoding. *)
let guess_encoding d =                  (* guess encoding and start decoder. *)
  let setup d = match r_encoding d.t 0 d.t_len with
  | `UTF_8 r ->
      d.encoding <- `UTF_8; d.k <- decode_utf_8;
      begin match r with
      | `BOM -> ret decode_utf_8 (`Uchar u_bom) 3 d
      | `Decode -> guessed_utf_8 d
      | `End -> `End
      end
  | `UTF_16BE r ->
      d.encoding <- `UTF_16BE; d.k <- decode_utf_16be; guessed_utf_16 d true r
  | `UTF_16LE r ->
      d.encoding <- `UTF_16LE; d.k <- decode_utf_16le; guessed_utf_16 d false r
  in
  (t_need d 3; t_fill setup d)

(* Character post-processors. Used for BOM handling, newline
   normalization and position tracking. The [pp_remove_bom] is only
   used for the first character to remove a possible initial BOM and
   handle UTF-16 endianness recognition. *)

let nline d = d.col <- 0; d.line <- d.line + 1                 (* inlined. *)
let ncol d = d.col <- d.col + 1                                (* inlined. *)
let ncount d = d.count <- d.count + 1                          (* inlined. *)
let cr d b = d.last_cr <- b                                    (* inlined. *)

(* [pp_remove_bom utf16 pp] post-processes the first decoded value only: in
   every case it replaces itself by [pp] in [d.pp]. An initial BOM is
   dropped and decoding continues with [d.k]. If [utf16] is [true] the BOM,
   or its byte swapped form, also selects the UTF-16BE or UTF-16LE
   decoder. *)
let pp_remove_bom utf16 pp d = function(* removes init. BOM, handles UTF-16. *)
| `Uchar 0xFEFF (* BOM *) ->
    if utf16 then (d.encoding <- `UTF_16BE; d.k <- decode_utf_16be);
    d.removed_bom <- true; d.pp <- pp; d.k d
| `Uchar 0xFFFE (* BOM reversed from decode_utf_16be *) when utf16 ->
    d.encoding <- `UTF_16LE; d.k <- decode_utf_16le;
    d.removed_bom <- true; d.pp <- pp; d.k d
| `Malformed _ | `Uchar _ as v ->
    d.removed_bom <- false; d.pp <- pp; d.pp d v

(* Position tracking without normalization. Every value is returned
   unchanged; an LF that follows a CR does not start a new line. *)
let pp_nln_none d = function
| `Uchar 0x000A (* LF *) as v ->
    let last_cr = d.last_cr in
    cr d false; ncount d; if last_cr then v else (nline d; v)
| `Uchar 0x000D (* CR *) as v -> cr d true; ncount d; nline d; v
| `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) as v ->
    cr d false; ncount d; nline d; v
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v

(* `Readline normalization: CR, LF, CR LF, NEL, FF, LS and PS are all
   returned as [d.nl]. The LF of a CR LF pair is dropped by decoding the
   next character with [d.k]. *)
let pp_nln_readline d = function
| `Uchar 0x000A (* LF *) ->
    let last_cr = d.last_cr in
    cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
| `Uchar 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
| `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) ->
    cr d false; ncount d; nline d; `Uchar d.nl
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v

(* `NLF normalization: CR, LF, CR LF and NEL are returned as [d.nl]. FF, LS
   and PS start a new line but are returned unchanged. *)
let pp_nln_nlf d = function
| `Uchar 0x000A (* LF *) ->
    let last_cr = d.last_cr in
    cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
| `Uchar 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
| `Uchar 0x0085 (* NEL *) -> cr d false; ncount d; nline d; `Uchar d.nl
| `Uchar (0x000C | 0x2028 | 0x2029) as v (* FF | LS | PS *) ->
    cr d false; ncount d; nline d; v
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v

(* `ASCII normalization: CR, LF and CR LF are returned as [d.nl]. NEL, FF,
   LS and PS start a new line but are returned unchanged. *)
let pp_nln_ascii d = function
| `Uchar 0x000A (* LF *) ->
    let last_cr = d.last_cr in
    cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl)
| `Uchar 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl
| `Uchar (0x0085 | 0x000C | 0x2028 | 0x2029) as v (* NEL | FF | LS | PS *) ->
    cr d false; ncount d; nline d; v
| `Uchar _ | `Malformed _ as v -> cr d false; ncount d; ncol d; v

(* [decode_fun e] is the decoding function used for encoding [e]. *)
let decode_fun = function
| `UTF_8 -> decode_utf_8
| `UTF_16 -> decode_utf_16be                       (* see [pp_remove_bom]. *)
| `UTF_16BE -> decode_utf_16be
| `UTF_16LE -> decode_utf_16le
| `US_ASCII -> decode_us_ascii
| `ISO_8859_1 -> decode_iso_8859_1

(* [decoder ?nln ?encoding src] is a decoder reading from [src]. Without
   [encoding] the encoding is guessed from the first bytes of the input.
   Without [nln] newlines are not normalized. The initial line number is 1
   and the initial column is 0. *)
let decoder ?nln ?encoding src =
  let pp, nl = match nln with
  | None -> pp_nln_none, 0x000A (* not used. *)
  | Some (`ASCII nl) -> pp_nln_ascii, nl
  | Some (`NLF nl) -> pp_nln_nlf, nl
  | Some (`Readline nl) -> pp_nln_readline, nl
  in
  let encoding, k = match encoding with
  | None -> `UTF_8, guess_encoding
  | Some e -> (e :> decoder_encoding), decode_fun e
  in
  let i, i_pos, i_max = match src with
  | `Manual -> "", 1, 0                              (* implies i_rem d = 0. *)
  | `Channel _ -> String.create io_buffer_size, 1, 0                (* idem. *)
  | `String s -> s, 0, String.length s - 1
  in
  { src = (src :> src); encoding; nln = (nln :> nln option); nl;
    i; i_pos; i_max; t = String.create 4; t_len = 0; t_need = 0;
    removed_bom = false; last_cr = false; line = 1; col = 0;
    byte_count = 0; count = 0;
    pp = pp_remove_bom (encoding = `UTF_16) pp; k }

(* [decode d] runs the current continuation of [d]. *)
let decode d = d.k d
let decoder_line d = d.line
let decoder_col d = d.col
let decoder_byte_count d = d.byte_count
let decoder_count d = d.count
let decoder_removed_bom d = d.removed_bom
let decoder_src d = d.src
let decoder_nln d = d.nln
let decoder_encoding d = d.encoding

(* [set_decoder_encoding d e] makes [d] decode with [e] from now on; the
   current continuation is replaced. *)
let set_decoder_encoding d e =
  d.encoding <- (e :> decoder_encoding); d.k <- decode_fun e

(* Encode *)

type dst = [ `Channel of out_channel | `Buffer of Buffer.t | `Manual ]
type encode = [ `Await | `End | `Uchar of uchar ]
type encoder =
  { dst : dst;                                      (* output destination. *)
    encoding : encoding;                              (* encoded encoding. *)
    mutable o : string;                           (* current output chunk. *)
    mutable o_pos : int;                 (* next output position to write. *)
    mutable o_max : int;              (* maximal output position to write. *)
    t : string;                (* four bytes buffer for overlapping writes. *)
    mutable t_pos : int;                 (* next position to read in [t]. *)
    mutable t_max : int;              (* maximal position to read in [t]. *)
    mutable k :                                   (* encoder continuation. *)
      encoder -> encode -> [ `Ok | `Partial ] }

(* On encodes that overlap two (or more) [e.o] buffers, we encode the
   character to the temporary buffer [e.t] and continue with
   [t_flush] to write this data on the different [e.o] buffers. If the
   [e.o] buffers are not too small this is faster than continuation
   based byte per byte writes. *)

let o_rem e = e.o_max - e.o_pos + 1    (* remaining bytes to write in [e.o]. *)

(* [dst e s j l] makes the [l] bytes of [s] starting at [j] the output
   storage of [e]. Raises [Invalid_argument] if [j] and [l] do not denote a
   valid range of [s]. *)
let dst e s j l =                                     (* set [e.o] with [s]. *)
  if (j < 0 || l < 0 || j + l > String.length s) then invalid_bounds j l;
  e.o <- s; e.o_pos <- j; e.o_max <- j + l - 1

(* Continuation installed while a `Manual encoder waits for storage: only
   `Await is accepted, anything else raises [Invalid_argument]. *)
let partial k e = function `Await -> k e
| `Uchar _ | `End -> invalid_encode ()

(* For `Manual destinations the encoder is suspended and `Partial is
   returned. For the others the bytes written so far in [e.o] are output
   and [e.o] is reused from its start. *)
let flush k e = match e.dst with  (* get free storage in [e.o] and [k]ontinue. *)
| `Manual -> e.k <- partial k; `Partial
| `Buffer b -> Buffer.add_substring b e.o 0 e.o_pos; e.o_pos <- 0; k e
| `Channel oc -> output oc e.o 0 e.o_pos; e.o_pos <- 0; k e

(* [t_range e max] declares that [e.t] holds bytes from position 0 to
   [max] included. *)
let t_range e max = e.t_pos <- 0; e.t_max <- max

let rec t_flush k e =          (* flush [e.t] up to [e.t_max] in [e.o]. *)
  let blit e l =
    unsafe_blit e.t e.t_pos e.o e.o_pos l;
    e.o_pos <- e.o_pos + l; e.t_pos <- e.t_pos + l
  in
  let rem = o_rem e in
  let len = e.t_max - e.t_pos + 1 in
  if rem < len then (blit e rem; flush (t_flush k) e) else
  (blit e len; k e)

(* Encoders. In each of them the bytes of a character are written directly
   in [e.o] if there is room for all of them; otherwise they are written in
   [e.t] and moved by [t_flush]. No check is made that the value is a
   Unicode scalar value. *)

let rec encode_utf_8 e v =
  let k e = e.k <- encode_utf_8; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u as v ->
      let rem = o_rem e in
      if u <= 0x007F then
        if rem < 1 then flush (fun e -> encode_utf_8 e v) e else
        (unsafe_set_byte e.o e.o_pos u; e.o_pos <- e.o_pos + 1; k e)
      else if u <= 0x07FF then
        begin
          let s, j, k =
            if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
          in
          unsafe_set_byte s j (0xC0 lor (u lsr 6));
          unsafe_set_byte s (j + 1) (0x80 lor (u land 0x3F));
          k e
        end
      else if u <= 0xFFFF then
        begin
          let s, j, k =
            if rem < 3 then (t_range e 2; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 3; e.o, j, k)
          in
          unsafe_set_byte s j (0xE0 lor (u lsr 12));
          unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 6) land 0x3F));
          unsafe_set_byte s (j + 2) (0x80 lor (u land 0x3F));
          k e
        end
      else
        begin
          let s, j, k =
            if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
          in
          unsafe_set_byte s j (0xF0 lor (u lsr 18));
          unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 12) land 0x3F));
          unsafe_set_byte s (j + 2) (0x80 lor ((u lsr 6) land 0x3F));
          unsafe_set_byte s (j + 3) (0x80 lor (u land 0x3F));
          k e
        end

let rec encode_utf_16be e v =
  let k e = e.k <- encode_utf_16be; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u ->
      let rem = o_rem e in
      if u < 0x10000 then
        begin
          let s, j, k =
            if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
          in
          unsafe_set_byte s j (u lsr 8);
          unsafe_set_byte s (j + 1) (u land 0xFF);
          k e
        end
      else
        begin
          let s, j, k =
            if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
          in
          (* surrogate pair. *)
          let u' = u - 0x10000 in
          let hi = (0xD800 lor (u' lsr 10)) in
          let lo = (0xDC00 lor (u' land 0x3FF)) in
          unsafe_set_byte s j (hi lsr 8);
          unsafe_set_byte s (j + 1) (hi land 0xFF);
          unsafe_set_byte s (j + 2) (lo lsr 8);
          unsafe_set_byte s (j + 3) (lo land 0xFF);
          k e
        end

let rec encode_utf_16le e v =     (* encode_utf_16be with bytes swapped. *)
  let k e = e.k <- encode_utf_16le; `Ok in
  match v with
  | `Await -> k e
  | `End -> flush k e
  | `Uchar u ->
      let rem = o_rem e in
      if u < 0x10000 then
        begin
          let s, j, k =
            if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k)
          in
          unsafe_set_byte s j (u land 0xFF);
          unsafe_set_byte s (j + 1) (u lsr 8);
          k e
        end
      else
        begin
          let s, j, k =
            if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else
            let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k)
          in
          (* surrogate pair. *)
          let u' = u - 0x10000 in
          let hi = (0xD800 lor (u' lsr 10)) in
          let lo = (0xDC00 lor (u' land 0x3FF)) in
          unsafe_set_byte s j (hi land 0xFF);
          unsafe_set_byte s (j + 1) (hi lsr 8);
          unsafe_set_byte s (j + 2) (lo land 0xFF);
          unsafe_set_byte s (j + 3) (lo lsr 8);
          k e
        end

(* [encode_fun e] is the encoding function used for encoding [e]; `UTF_16
   is encoded big endian. *)
let encode_fun = function
| `UTF_8 -> encode_utf_8
| `UTF_16 -> encode_utf_16be
| `UTF_16BE -> encode_utf_16be
| `UTF_16LE -> encode_utf_16le

(* [encoder encoding dst] is an encoder for [encoding] writing on [dst].
   `Manual encoders start without storage, the others get an internal
   buffer of [io_buffer_size] bytes. *)
let encoder encoding dst =
  let o, o_pos, o_max = match dst with
  | `Manual -> "", 1, 0                              (* implies o_rem e = 0. *)
  | `Buffer _
  | `Channel _ -> String.create io_buffer_size, 0, io_buffer_size - 1
  in
  { dst = (dst :> dst); encoding = (encoding :> encoding); o; o_pos; o_max;
    t = String.create 4; t_pos = 1; t_max = 0; k = encode_fun encoding}

(* [encode e v] runs the current continuation of [e] on [v]. *)
let encode e v = e.k e (v :> encode)
let encoder_encoding e = e.encoding
let encoder_dst e = e.dst

(* Manual sources and destinations. *)

module Manual = struct
  let src = src
  let dst = dst
  let dst_rem = o_rem
end

(* Strings folders and Buffer encoders *)

module String = struct

  (* [encoding_guess s] is the encoding guessed by [r_encoding] from the
     first bytes of [s], paired with [true] iff a BOM was recognized.

     [r_encoding] requires [j + l <= String.length s] and reads with
     unchecked accesses, so the length passed is capped with [min]: at most
     three bytes and never more than [s] holds. Asking for
     [max (String.length s) 3] bytes made it read past the end of strings
     shorter than three bytes and guess from whatever was found there. *)
  let encoding_guess s =
    match r_encoding s 0 (min (String.length s) 3) with
    | `UTF_8 d -> `UTF_8, (d = `BOM)
    | `UTF_16BE d -> `UTF_16BE, (d = `BOM)
    | `UTF_16LE d -> `UTF_16LE, (d = `BOM)

  type 'a folder =
    'a -> int -> [ `Uchar of uchar | `Malformed of string ] -> 'a

  (* [fold_utf_8 f acc s] folds [f] over the characters of [s] decoded as
     UTF-8. [f] gets the accumulator, the byte index at which the decode
     starts and the decode. A sequence truncated by the end of [s] is
     reported as a single malformed decode and ends the fold. *)
  let fold_utf_8 f acc s =
    let rec loop acc f s i l =
      if i = l then acc else
      let need = unsafe_array_get utf_8_len (unsafe_byte s i) in
      if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) l else
      let rem = l - i in
      if rem < need then f acc i (malformed s i rem) else
      loop (f acc i (r_utf_8 s i need)) f s (i + need) l
    in
    loop acc f s 0 (String.length s)

  (* [fold_utf_16be f acc s] is like [fold_utf_8] for UTF-16BE. A trailing
     single byte or a high surrogate without enough following bytes is
     reported as a single malformed decode and ends the fold. *)
  let fold_utf_16be f acc s =
    let rec loop acc f s i l =
      if i = l then acc else
      let rem = l - i in
      if rem < 2 then f acc i (malformed s i 1) else
      match r_utf_16 s i (i + 1) with
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) l
      | `Hi hi ->
          if rem < 4 then f acc i (malformed s i rem) else
          loop (f acc i (r_utf_16_lo hi s (i + 2) (i + 3))) f s (i + 4) l
    in
    loop acc f s 0 (String.length s)

  let fold_utf_16le f acc s =           (* [fold_utf_16be], bytes swapped. *)
    let rec loop acc f s i l =
      if i = l then acc else
      let rem = l - i in
      if rem < 2 then f acc i (malformed s i 1) else
      match r_utf_16 s (i + 1) i with
      | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) l
      | `Hi hi ->
          if rem < 4 then f acc i (malformed s i rem) else
          loop (f acc i (r_utf_16_lo hi s (i + 3) (i + 2))) f s (i + 4) l
    in
    loop acc f s 0 (String.length s)
end

module Buffer = struct

  (* [add_utf_8 b u] appends the UTF-8 bytes of [u] to [b]. No check is
     made that [u] is a Unicode scalar value. *)
  let add_utf_8 b u =
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
    if u <= 0x007F then
      (w u)
    else if u <= 0x07FF then
      (w (0xC0 lor (u lsr 6));
       w (0x80 lor (u land 0x3F)))
    else if u <= 0xFFFF then
      (w (0xE0 lor (u lsr 12));
       w (0x80 lor ((u lsr 6) land 0x3F));
       w (0x80 lor (u land 0x3F)))
    else
      (w (0xF0 lor (u lsr 18));
       w (0x80 lor ((u lsr 12) land 0x3F));
       w (0x80 lor ((u lsr 6) land 0x3F));
       w (0x80 lor (u land 0x3F)))

  (* [add_utf_16be b u] appends the UTF-16BE bytes of [u] to [b], as a
     surrogate pair if [u] is 0x10000 or greater. *)
  let add_utf_16be b u =
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
    if u < 0x10000 then (w (u lsr 8); w (u land 0xFF)) else
    let u' = u - 0x10000 in
    let hi = (0xD800 lor (u' lsr 10)) in
    let lo = (0xDC00 lor (u' land 0x3FF)) in
    w (hi lsr 8); w (hi land 0xFF);
    w (lo lsr 8); w (lo land 0xFF)

  let add_utf_16le b u =                        (* swapped add_utf_16be. *)
    let w byte = Buffer.add_char b (unsafe_chr byte) in          (* inlined. *)
    if u < 0x10000 then (w (u land 0xFF); w (u lsr 8)) else
    let u' = u - 0x10000 in
    let hi = (0xD800 lor (u' lsr 10)) in
    let lo = (0xDC00 lor (u' land 0x3FF)) in
    w (hi land 0xFF); w (hi lsr 8);
    w (lo land 0xFF); w (lo lsr 8)
end

(*---------------------------------------------------------------------------
   Copyright 2012 Daniel C. Bünzli
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimer in the documentation and/or other materials provided
      with the distribution.

   3. Neither the name of Daniel C. Bünzli nor the names of
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  ---------------------------------------------------------------------------*)