From 688757cf4a7cbeb3e29333d9e746d68a5e2b0c6f Mon Sep 17 00:00:00 2001 From: "Hezekiah M. Carty" Date: Sun, 26 Apr 2020 13:04:49 -0600 Subject: [PATCH] Experimenting with decompress --- dune-project | 2 +- ezgzip.opam | 2 +- src/dune | 3 +- src/ezgzip.ml | 290 ++++++++++--------------------------------------- src/ezgzip.mli | 50 ++++----- test/dune | 2 +- test/test.ml | 27 ++--- 7 files changed, 90 insertions(+), 286 deletions(-) diff --git a/dune-project b/dune-project index ea97083..197eecd 100644 --- a/dune-project +++ b/dune-project @@ -1,2 +1,2 @@ -(lang dune 1.0) +(lang dune 1.1) (name ezgzip) diff --git a/ezgzip.opam b/ezgzip.opam index dc23dde..0f01fdc 100644 --- a/ezgzip.opam +++ b/ezgzip.opam @@ -41,7 +41,7 @@ depends: [ "alcotest" {with-test & >= "0.8.1"} "astring" "benchmark" {with-test & >= "1.4"} - "dune" {>= "1.0"} + "dune" {>= "1.1"} "ocplib-endian" "odoc" {with-doc & >= "1.1.1"} "qcheck" {with-test & >= "0.7"} diff --git a/src/dune b/src/dune index 5695db5..1e0dd8f 100644 --- a/src/dune +++ b/src/dune @@ -1,4 +1,3 @@ (library - (name ezgzip) (public_name ezgzip) - (libraries astring ocplib-endian rresult camlzip)) + (libraries bigstringaf decompress.gz)) diff --git a/src/ezgzip.ml b/src/ezgzip.ml index e043ced..26d83fc 100644 --- a/src/ezgzip.ml +++ b/src/ezgzip.ml @@ -1,235 +1,55 @@ -open Astring -open Rresult - -type error = - | Truncated of string - | Invalid_format - | Compression_error of string - | Size of {got: int; expected: int} - | Checksum - -let pp_error fmt error = - match error with - | Truncated content -> - Format.fprintf fmt "Truncated content after %d bytes" - (String.length content) - | Invalid_format -> Format.fprintf fmt "Invalid gzip format" - | Compression_error msg -> Format.fprintf fmt "Compression error: %s" msg - | Size {got; expected} -> - Format.fprintf fmt - "Size mismatch after decompression: got %d, expected %d" got expected - | Checksum -> Format.fprintf fmt "Invalid checksum after decompression" - - -let pp_gzip_error fmt wrapped = - let `Gzip error = wrapped in - pp_error fmt error - - -let error e = R.error (`Gzip e) - -module Z = struct - type error = Truncated of string | Compression_error of string - - let pp_error fmt error = - match error with - | Truncated content -> - Format.fprintf fmt "Truncated content after %d bytes" - (String.length content) - | Compression_error msg -> Format.fprintf fmt "Compression error: %s" msg - - - let pp_zlib_error fmt wrapped = - let `Zlib error = wrapped in - pp_error fmt error - - - let error e = R.error (`Zlib e) - - let compress_zlib ?level ?(header= false) input output = - let pos = ref 0 in - let length = String.length input in - let feed buf = - let bytes = min (Bytes.length buf) (length - !pos) in - Bytes.blit_string input !pos buf 0 bytes ; - pos := !pos + bytes ; - bytes - in - Zlib.compress ?level ~header feed output - - - let uncompress_zlib ?(header= false) input output = - let pos = ref 0 in - let length = String.length input in - let feed buf = - let bytes = min (Bytes.length buf) (length - !pos) in - Bytes.blit_string input !pos buf 0 bytes ; - pos := !pos + bytes ; - bytes - in - Zlib.uncompress ~header feed output - - - let compress ?level ?header input = - let compressed = Buffer.create 1_024 in - let output buffer length = - Buffer.add_subbytes compressed buffer 0 length - in - compress_zlib ?level ?header input output ; - Buffer.contents compressed - - - let decompress ?header ?(max_size= Sys.max_string_length) input = - let size = ref 0 in - let uncompressed = Buffer.create 1_024 in - let output buffer length = - size := !size + length ; - if !size < 0 then - invalid_arg "Ezgzip: output larger than max string length" ; - if !size > max_size then raise Exit - else Buffer.add_subbytes uncompressed buffer 0 length - in - try - uncompress_zlib ?header input output ; - Ok (Buffer.contents uncompressed) - with - | Exit -> error (Truncated (Buffer.contents uncompressed)) - | Zlib.Error (func, msg) -> - let message = Format.asprintf "in %s: %s" func msg in - error (Compression_error message) -end - -let id1_id2 = "\x1f‹" - -(* XXX: Hard-coded gzip header may not be the best idea... *) -let header = - let compression_method = "\x08" in - let flags1 = "\x00" in - let time = "\x00\x00\x00\x00" in - let flags2 = "\x00" in - let os = "ÿ" in - String.concat [id1_id2; compression_method; flags1; time; flags2; os] - - -let header_size = - let bytes = String.length header in - assert (bytes = 10) ; - bytes - - -let footer_size = 8 - -let compress ?level raw = - ( match level with - | None -> () - | Some i -> - if i < 0 || i > 9 then - invalid_arg - (strf "Ezgzip.compress: invalid level %d - must be between 0 and 9" i) - ) ; - let int32_to_bytestring i = - let buf = Bytes.create 4 in - EndianString.LittleEndian.set_int32 buf 0 i ; - Bytes.to_string buf - in - let compressed = Z.compress ?level raw in - let length = String.length raw in - let crc32 = Zlib.update_crc_string 0l raw 0 length in - let crc32_checksum = int32_to_bytestring crc32 in - let original_size = - int32_to_bytestring (Int32.of_int (length mod 0x1_0000_0000)) - in - String.concat [header; compressed; crc32_checksum; original_size] - - -type flags = {text: bool; crc16: bool; extra: bool; name: bool; comment: bool} - -let flags_of_int i = - let bit x = i land x = x in - if bit 32 || bit 64 || bit 128 then error Invalid_format - else - Ok {text= bit 1; crc16= bit 2; extra= bit 4; name= bit 8; comment= bit 16} - - -type t = {compressed: string; crc32: int32; original_size: int} - -let extra_content_length raw flags = - let extra_bytes = ref 0 in - let offset () = !extra_bytes + header_size in - ( if flags.extra then - let xlen = EndianString.LittleEndian.get_int16 raw (offset ()) in - extra_bytes := !extra_bytes + xlen + 2 ) ; - ( if flags.name then - let sub = String.sub_with_range ~first:(offset ()) raw in - let name = String.Sub.take ~sat:(fun c -> c <> '\x00') sub in - extra_bytes := !extra_bytes + String.Sub.length name + 1 ) ; - ( if flags.comment then - let sub = String.sub_with_range ~first:(offset ()) raw in - let comment = String.Sub.take ~sat:(fun c -> c <> '\x00') sub in - extra_bytes := !extra_bytes + String.Sub.length comment + 1 ) ; - if flags.crc16 then extra_bytes := !extra_bytes + 2 ; - !extra_bytes - - -let parse_gzip_bytes raw = - (* XXX: Ignoring most of the header may not be the best idea... *) - let ( >>= ) = R.( >>= ) in - (* Make sure we have enough bytes to work with *) - ( if String.length raw < header_size + footer_size then error Invalid_format - else Ok () ) - >>= fun () -> - (* Check magic bytes *) - (if String.is_prefix ~affix:id1_id2 raw then Ok () else error Invalid_format) - >>= fun () -> - (* Parse flags *) - flags_of_int (Char.to_int raw.[3]) - >>= fun flags -> - (* Calculate the extra content size so we can skip it *) - ( match extra_content_length raw flags with - | length -> Ok length - | exception Exit -> error Invalid_format ) - >>= fun extra_size -> - (* Make sure we actually have data left over *) - let compressed_length = - String.length raw - header_size - footer_size - extra_size - in - (if compressed_length >= 0 then Ok () else error Invalid_format) - >>= fun () -> - let compressed = - String.with_range ~first:(header_size + extra_size) ~len:compressed_length - raw - in - let crc32 = - EndianString.LittleEndian.get_int32 raw (String.length raw - 4 - 4) - in - let original_size = - let size = - EndianString.LittleEndian.get_int32 raw (String.length raw - 4) - in - Int32.to_int size land 0xffff_ffff - in - Ok {compressed; crc32; original_size} - - -let decompress ?(ignore_size= false) ?(ignore_checksum= false) ?max_size raw = - let ( >>= ) = R.( >>= ) in - parse_gzip_bytes raw - >>= fun {compressed; crc32; original_size} -> - ( match Z.decompress ?max_size compressed with - | Ok uncompressed -> Ok uncompressed - | Error `Zlib Truncated uncompressed -> error (Truncated uncompressed) - | Error `Zlib Compression_error message -> error (Compression_error message) - ) - >>= fun uncompressed -> - if not ignore_size - && String.length uncompressed mod 0x1_0000_0000 <> original_size - then - R.error - (`Gzip (Size {got= String.length uncompressed; expected= original_size})) - else - let crc32_calculated () = - Zlib.update_crc_string 0l uncompressed 0 (String.length uncompressed) - in - if not ignore_checksum && crc32_calculated () <> crc32 then - R.error (`Gzip Checksum) - else Ok uncompressed +type error = Invalid_content of string | Truncated of string + +let error (e : error) = Error (`Gzip e) + +let pp_gzip_error ppf (`Gzip (e : error)) = + match e with + | Invalid_content msg -> Format.fprintf ppf "Invalid GZIP content: %s" msg + | Truncated msg -> Format.fprintf ppf "Truncated gzip content: %s" msg + +exception Exceeded_max_size + +let default_buffer_size = ref 65_536 + +let refill s = + let src_pos = ref 0 in + fun bs -> + let len = min (Bigstringaf.length bs) (String.length s - !src_pos) in + Bigstringaf.blit_from_string s ~src_off:!src_pos bs ~dst_off:0 ~len; + src_pos := !src_pos + len; + len + +let flush ~max_size decompressed_buf scratch_buf src len = + if Buffer.length decompressed_buf + len > max_size then + raise Exceeded_max_size + else ( + Bigstringaf.blit_to_bytes src ~src_off:0 scratch_buf ~dst_off:0 ~len; + Buffer.add_subbytes decompressed_buf scratch_buf 0 len ) + +let make_io_bigstrings buffer_size = + (Bigstringaf.create buffer_size, Bigstringaf.create buffer_size) + +let uncompress ?(max_size = Sys.max_string_length) + ?(buffer_size = !default_buffer_size) gz = + let i, o = make_io_bigstrings buffer_size in + let refill = refill gz in + let buffer = Buffer.create buffer_size in + let scratch = Bytes.create buffer_size in + let flush = flush ~max_size buffer scratch in + match Gz.Higher.uncompress ~i ~o ~refill ~flush with + | Ok _meta -> Ok (Buffer.contents buffer) + | Error (`Msg m) -> error (Invalid_content m) + | exception Exceeded_max_size -> error (Truncated (Buffer.contents buffer)) + +let compress ?level ?(buffer_size = !default_buffer_size) raw = + let i, o = make_io_bigstrings buffer_size in + let w = De.make_window ~bits:15 in + let q = De.Queue.create 4_096 in + let refill = refill raw in + let buffer = Buffer.create buffer_size in + let scratch = Bytes.create buffer_size in + let flush = flush ~max_size:Sys.max_string_length buffer scratch in + let mtime () = 0l in + let config = Gz.Higher.configuration Unknown mtime in + Gz.Higher.compress ?level ~w ~q ~i ~o ~refill ~flush () config; + Buffer.contents buffer diff --git a/src/ezgzip.mli b/src/ezgzip.mli index 9bbeef1..bf35839 100644 --- a/src/ezgzip.mli +++ b/src/ezgzip.mli @@ -1,36 +1,24 @@ -(** {1 gzip compression} *) +(** {1 Simple gzip (de)compression} *) -(** Possible error cases *) -type error = - | Truncated of string - (** Extracted size is greater than the allowed maximum size *) - | Invalid_format (** Invalid data format *) - | Compression_error of string (** zlib error *) - | Size of {got: int; expected: int} - (** Extracted size does not match what was expected based on the source - metadata *) - | Checksum - (** Extracted content checksum does not match what was expected based on - the source metadata *) +type error = Invalid_content of string | Truncated of string -val compress : ?level:int -> string -> string +val pp_gzip_error : Format.formatter -> [ `Gzip of error] -> unit + +val compress : ?level:int -> ?buffer_size:int -> string -> string (** [compress src] returns a gzip-compressed version of [src]. @param level can use used to set the compression level from [0] (no - compression) to [9] (highest compression). + compression) to [3] (highest compression). - @raise Invalid_argument if [level] is outside of the range 0 to 9. *) + @raise Invalid_argument if [level] is outside of the range 0 to 3. *) -val decompress : - ?ignore_size:bool -> ?ignore_checksum:bool -> ?max_size:int -> string - -> (string, [> `Gzip of error]) result +val uncompress : + ?max_size:int -> + ?buffer_size:int -> + string -> + (string, [> `Gzip of error ]) result (** [decompress src] decompresses the content from the gzip-compressed [src]. - @param ignore_size may be set to [true] if you want to ignore the expected - decompressed size information in the gzip footer. Defaults to [false]. - @param ignore_checksum may be set to [true] if you want to ignore the - expected decompressed data checksum in the gzip footer. Defaults to - [false]. @param max_size may be used to specify the maximum number of bytes to decompress. Defaults to [Sys.max_string_length]. If [src] decompresses to more than [max_size] bytes then this function will return @@ -40,10 +28,7 @@ val decompress : @return [Ok content] if the decompression was successful @return [Error err] if there was a problem during decompression *) -val pp_error : Format.formatter -> error -> unit - -val pp_gzip_error : Format.formatter -> [`Gzip of error] -> unit - +(* module Z : sig (** {1 zlib compression} *) @@ -58,8 +43,10 @@ module Z : sig representation of [input]. *) val decompress : - ?header:bool -> ?max_size:int -> string - -> (string, [> `Zlib of error]) result + ?header:bool -> + ?max_size:int -> + string -> + (string, [> `Zlib of error ]) result (** [decompress ?header ?max_size input] will return a decompressed representation of [input]. @@ -71,5 +58,6 @@ module Z : sig val pp_error : Format.formatter -> error -> unit - val pp_zlib_error : Format.formatter -> [`Zlib of error] -> unit + val pp_zlib_error : Format.formatter -> [ `Zlib of error ] -> unit end +*) diff --git a/test/dune b/test/dune index 776c83f..b38c46d 100644 --- a/test/dune +++ b/test/dune @@ -1,6 +1,6 @@ (executable (name test) - (libraries alcotest ezgzip qcheck)) + (libraries alcotest ezgzip qcheck-alcotest)) (alias (name runtest) diff --git a/test/test.ml b/test/test.ml index 218aeee..14901cb 100644 --- a/test/test.ml +++ b/test/test.ml @@ -2,38 +2,35 @@ let basic_predefined_checks () = let gzip_error = Alcotest.testable Ezgzip.pp_gzip_error (fun _ _ -> true) in Alcotest.(check (result string gzip_error)) "Round trip - known good" (Ok "hello world") - (Ezgzip.compress "hello world" |> Ezgzip.decompress) ; + (Ezgzip.compress "hello world" |> Ezgzip.uncompress); Alcotest.(check (result string gzip_error)) - "Known bad" (Error (`Gzip (Compression_error "placeholder message"))) - (Ezgzip.decompress "probably not gzip") ; + "Known bad" + (Error (`Gzip (Invalid_content "placeholder message"))) + (Ezgzip.uncompress "probably not gzip"); let big = String.make 1_000_000 'x' in Alcotest.(check (result string gzip_error)) "Big-ish" (Ok big) - (Ezgzip.compress big |> Ezgzip.decompress) ; + (Ezgzip.compress big |> Ezgzip.uncompress); () - let round_trip s = let compressed = Ezgzip.compress s in - match Ezgzip.decompress compressed with + match Ezgzip.uncompress compressed with | Error err -> - Alcotest.failf "Failed to decompress: %a" Ezgzip.pp_gzip_error err + Alcotest.failf "Failed to decompress: %a" Ezgzip.pp_gzip_error err | Ok s' -> Alcotest.equal Alcotest.string s s' - let round_trip_random_cases name count () = QCheck.Test.make ~count ~name QCheck.string round_trip |> QCheck.Test.check_exn - -let predefined_tests = [("predefined cases", `Quick, basic_predefined_checks)] +let predefined_tests = [ ("predefined cases", `Quick, basic_predefined_checks) ] let quickcheck_tests = - [ ( "round-trip" - , `Quick - , round_trip_random_cases "round-trip quickcheck" 1_000 ) ] - + [ + ("round-trip", `Quick, round_trip_random_cases "round-trip quickcheck" 1_000); + ] let () = Alcotest.run "ezgzip" - [("basic", predefined_tests); ("quickcheck", quickcheck_tests)] + [ ("basic", predefined_tests); ("quickcheck", quickcheck_tests) ]