Skip to content

Commit a6eb859

Browse files
committed
pkg: rewrite archive extraction with typed tar capabilities
We rewrite the extraction capabilities for package management, enforcing the invariants we wish to preserve at the type level. The aim is to separate the concerns of finding an appropriate archive binary, checking the capabilities of this binary, creating arguments for this binary and invoking this for extraction. The public API of extraction is still simple, so the typed invariants are more of an internal impelementation detail. A sumamry of the changes are as follows: - Use GADTs to encode which formats each tar implementation supports, preventing invalid combinations at compile time - Detect tar implementation (BSD/GNU/Other) via --version output - Prefer BSD tar over unzip for zip extraction - Check bsdtar before tar to get more capable implementation - Add gtar to binary list for GNU tar on macOS/BSD - Pass explicit -z/-j flags for OpenBSD/busybox tar Signed-off-by: Ali Caglayan <[email protected]>
1 parent 7437932 commit a6eb859

File tree

12 files changed

+532
-241
lines changed

12 files changed

+532
-241
lines changed

src/dune_pkg/archive.ml

Lines changed: 362 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,362 @@
1+
open Stdune
2+
module Process = Dune_engine.Process
3+
open Fiber.O
4+
5+
(** {1 Archive Extraction}
6+
7+
This module handles extraction of .tar, .tar.gz, .tar.bz2, and .zip archives
8+
using system tools (tar, bsdtar, unzip). The main complexity comes from
9+
supporting different tar implementations with varying capabilities:
10+
11+
- BSD tar (libarchive): Auto-detects compression, supports zip extraction
12+
- GNU tar: Auto-detects compression, no zip support
13+
- Other (OpenBSD, busybox): Requires explicit -z/-j flags, no zip support
14+
15+
We use GADTs with polymorphic variants to encode these capabilities at the
16+
type level. [Tar.t] has a type parameter indicating which formats it supports,
17+
and [Tar.args] can only be called with formats valid for that tar. This
18+
prevents invalid combinations (like extracting zip with GNU tar) at compile
19+
time rather than runtime.
20+
21+
For zip extraction, we prefer BSD tar when available, falling back to the
22+
[unzip] binary if BSD tar is not found. *)
23+
24+
module Format : sig
25+
(** Supported archive formats:
26+
- [`Tar]: uncompressed tar (.tar)
27+
- [`Tar_gz]: gzip-compressed tar (.tar.gz, .tgz)
28+
- [`Tar_bz2]: bzip2-compressed tar (.tar.bz2, .tbz)
29+
- [`Zip]: zip archive (.zip) *)
30+
type t =
31+
[ `Tar
32+
| `Tar_gz
33+
| `Tar_bz2
34+
| `Zip
35+
]
36+
37+
(** Detect archive format from filename extension. Returns the format and
38+
the matched extension. *)
39+
val of_filename : Filename.t -> (t * string) option
40+
end = struct
41+
type t =
42+
[ `Tar
43+
| `Tar_gz
44+
| `Tar_bz2
45+
| `Zip
46+
]
47+
48+
let of_filename =
49+
let extensions =
50+
[ ".tar", `Tar
51+
; ".tar.gz", `Tar_gz
52+
; ".tgz", `Tar_gz
53+
; ".tar.bz2", `Tar_bz2
54+
; ".tbz", `Tar_bz2
55+
; ".zip", `Zip
56+
]
57+
in
58+
fun filename ->
59+
let check_suffix suffix = Filename.check_suffix filename suffix in
60+
List.find_map extensions ~f:(fun (ext, format) ->
61+
Option.some_if (check_suffix ext) (format, ext))
62+
;;
63+
end
64+
65+
let is_supported filename = Option.is_some (Format.of_filename filename)
66+
let which bin_name = Bin.which ~path:(Env_path.path Env.initial) bin_name
67+
68+
module Error = struct
69+
type t =
70+
| No_extractor of
71+
{ ext : string
72+
; tried : string list
73+
}
74+
| Command_failed of
75+
{ bin : Path.t
76+
; archive : Path.t
77+
; exit_code : int
78+
; stderr : string list
79+
}
80+
| Read_dir_failed of
81+
{ archive : Path.t
82+
; error : Unix_error.Detailed.t
83+
}
84+
85+
let message = function
86+
| No_extractor { ext; tried } ->
87+
[ Pp.textf "No program found to extract %s files. Tried:" ext
88+
; Pp.enumerate tried ~f:Pp.verbatim
89+
]
90+
| Command_failed { bin; archive; exit_code; stderr } ->
91+
[ Pp.textf "Failed to extract '%s':" (Path.basename archive)
92+
; Pp.hovbox
93+
(Pp.concat
94+
~sep:Pp.space
95+
[ User_message.command (Path.basename bin)
96+
; Pp.textf "failed with non-zero exit code '%d' and output:" exit_code
97+
])
98+
; Pp.vbox (Pp.concat_map ~sep:Pp.cut ~f:Pp.paragraph stderr)
99+
]
100+
| Read_dir_failed { archive; error } ->
101+
[ Pp.textf "Failed to extract '%s':" (Path.to_string_maybe_quoted archive)
102+
; Pp.text (Unix_error.Detailed.to_string_hum error)
103+
]
104+
;;
105+
106+
let raise t = User_error.raise (message t)
107+
end
108+
109+
module Tar : sig
110+
(** Tar executable for current system. The type parameter encodes which
111+
formats this tar supports - pattern match on [found] to get the right
112+
type.
113+
114+
{v
115+
| Kind | Implementations | Detects compression | Zip support |
116+
|-------|--------------------------|---------------------|-------------|
117+
| Bsd | bsdtar, libarchive | yes | yes |
118+
| | (macOS, FreeBSD, Win10+) | | |
119+
| | | | |
120+
| Gnu | GNU tar (most Linux) | yes | no |
121+
| | | | |
122+
| Other | OpenBSD tar, busybox tar | no | no |
123+
v}
124+
125+
[Other] is the fallback for unknown implementations. We use explicit
126+
[-z]/[-j] flags which is the safest approach. *)
127+
type 'fmt t
128+
129+
(** Archive formats supported by BSD tar. *)
130+
type bsd =
131+
[ `Tar
132+
| `Tar_gz
133+
| `Tar_bz2
134+
| `Zip
135+
]
136+
137+
(** Result of finding tar. Pattern match to get the appropriately typed tar. *)
138+
type found =
139+
| Bsd of bsd t
140+
| Gnu of [ `Tar | `Tar_gz | `Tar_bz2 ] t
141+
| Other of [ `Tar | `Tar_gz | `Tar_bz2 ] t
142+
143+
(** Attempt to find tar executable for current system. *)
144+
val find : found option Fiber.Lazy.t
145+
146+
(** Names of binaries checked when attempting to find tar executable. *)
147+
val bin_names : string list
148+
149+
(** Path to tar executable. *)
150+
val path : 'a t -> Path.t
151+
152+
(** [args t format ~archive ~target] provides the arguments for extraction.
153+
The [format] must be in the set of formats supported by [t]. *)
154+
val args : 'fmt t -> format:'fmt -> archive:Path.t -> target:Path.t -> string list
155+
end = struct
156+
type _ kind =
157+
| K_bsd : [ `Tar | `Tar_gz | `Tar_bz2 | `Zip ] kind
158+
| K_gnu : [ `Tar | `Tar_gz | `Tar_bz2 ] kind
159+
| K_other : [ `Tar | `Tar_gz | `Tar_bz2 ] kind
160+
161+
type 'fmt t =
162+
{ path : Path.t
163+
; kind : 'fmt kind
164+
}
165+
166+
type bsd =
167+
[ `Tar
168+
| `Tar_gz
169+
| `Tar_bz2
170+
| `Zip
171+
]
172+
173+
type found =
174+
| Bsd of bsd t
175+
| Gnu of [ `Tar | `Tar_gz | `Tar_bz2 ] t
176+
| Other of [ `Tar | `Tar_gz | `Tar_bz2 ] t
177+
178+
(** The order of binaries we should try from most capable to least + Windows. *)
179+
let bin_names = [ "bsdtar"; "tar"; "gtar"; "tar.exe" ]
180+
181+
let find =
182+
Fiber.Lazy.create (fun () ->
183+
match List.find_map bin_names ~f:which with
184+
| None -> Fiber.return None
185+
| Some path ->
186+
let+ output, _ = Process.run_capture ~display:Quiet Return path [ "--version" ] in
187+
let matches s = Re.execp (Re.compile (Re.str s)) output in
188+
Some
189+
(if matches "bsdtar" || matches "libarchive"
190+
then Bsd { path; kind = K_bsd }
191+
else if matches "GNU tar"
192+
then Gnu { path; kind = K_gnu }
193+
else Other { path; kind = K_other }))
194+
;;
195+
196+
let path t = t.path
197+
198+
let args (type fmt) (t : fmt t) ~(format : fmt) ~archive ~target =
199+
let decompress_flag =
200+
match t.kind, format with
201+
| K_bsd, (`Tar | `Tar_gz | `Tar_bz2 | `Zip) -> []
202+
| K_gnu, (`Tar | `Tar_gz | `Tar_bz2) -> []
203+
| K_other, `Tar -> []
204+
| K_other, `Tar_gz -> [ "-z" ]
205+
| K_other, `Tar_bz2 -> [ "-j" ]
206+
in
207+
[ "-x" ]
208+
@ decompress_flag
209+
@ [ "-f"; Path.to_string archive; "-C"; Path.to_string target ]
210+
;;
211+
end
212+
213+
module Unzip : sig
214+
(** A binary that can extract zip files. *)
215+
type t
216+
217+
(** Find a binary that can extract zip files. *)
218+
val find : t option Fiber.Lazy.t
219+
220+
(** Names of binaries checked when attempting to find binary that can extract
221+
zip files. *)
222+
val bin_names : string list
223+
224+
(** Path to binary that can extract zip files. *)
225+
val path : t -> Path.t
226+
227+
(** Arguments given to binary that can extract zip files. *)
228+
val args : t -> archive:Path.t -> target:Path.t -> string list
229+
end = struct
230+
type t =
231+
| Unzip of Path.t
232+
| Tar of Tar.bsd Tar.t
233+
234+
let bin_names = Tar.bin_names @ [ "unzip" ]
235+
236+
let find =
237+
(* We first check if we have a tar binary capable of unzipping, if not we
238+
fallback to unzip. *)
239+
Fiber.Lazy.create (fun () ->
240+
Fiber.Lazy.force Tar.find
241+
>>| function
242+
| Some (Bsd tar) -> Some (Tar tar)
243+
| Some (Gnu _ | Other _) | None ->
244+
(match which "unzip" with
245+
| Some path -> Some (Unzip path)
246+
| None -> None))
247+
;;
248+
249+
let path = function
250+
| Unzip path -> path
251+
| Tar tar -> Tar.path tar
252+
;;
253+
254+
let args t ~archive ~target =
255+
match t with
256+
| Unzip _ -> [ Path.to_string archive; "-d"; Path.to_string target ]
257+
| Tar tar -> Tar.args tar ~format:`Zip ~archive ~target
258+
;;
259+
end
260+
261+
module Extractor : sig
262+
(** [Extractor.run ~archive ~target] extracts the given [archive] using an
263+
appropriate tool to do so. *)
264+
val run : archive:Path.t -> target:Path.t -> (unit, Error.t) result Fiber.t
265+
end = struct
266+
let no_extractor_error ~ext ~bin_names =
267+
Fiber.return @@ Error (Error.No_extractor { ext; tried = bin_names })
268+
;;
269+
270+
let run_cmd ~bin ~args ~archive =
271+
let temp_stderr = Temp.create File ~prefix:"extract" ~suffix:"stderr" in
272+
Fiber.finalize ~finally:(fun () ->
273+
Temp.destroy File temp_stderr;
274+
Fiber.return ())
275+
@@ fun () ->
276+
let+ (), exit_code =
277+
Process.run
278+
~display:Quiet
279+
~stdout_to:
280+
(Process.Io.make_stdout ~output_on_success:Swallow ~output_limit:1_000_000)
281+
~stderr_to:(Process.Io.file temp_stderr Out)
282+
Return
283+
bin
284+
args
285+
in
286+
if exit_code <> 0
287+
then
288+
Io.with_file_in temp_stderr ~f:(fun err_channel ->
289+
let stderr = Io.input_lines err_channel in
290+
Error (Error.Command_failed { bin; archive; exit_code; stderr }))
291+
else Ok ()
292+
;;
293+
294+
let run ~archive ~target =
295+
let format, ext =
296+
Format.of_filename (Path.to_string archive) |> Option.value ~default:(`Tar, ".tar")
297+
in
298+
match (format : Format.t) with
299+
| `Zip ->
300+
Fiber.Lazy.force Unzip.find
301+
>>= (function
302+
| Some unzip ->
303+
let bin = Unzip.path unzip in
304+
let args = Unzip.args unzip ~archive ~target in
305+
run_cmd ~bin ~args ~archive
306+
| None -> no_extractor_error ~ext ~bin_names:Unzip.bin_names)
307+
| (`Tar | `Tar_gz | `Tar_bz2) as format ->
308+
Fiber.Lazy.force Tar.find
309+
>>= (function
310+
| Some (Bsd tar) ->
311+
let bin = Tar.path tar in
312+
let args = Tar.args tar ~format ~archive ~target in
313+
run_cmd ~bin ~args ~archive
314+
| Some (Gnu tar | Other tar) ->
315+
let bin = Tar.path tar in
316+
let args = Tar.args tar ~format ~archive ~target in
317+
run_cmd ~bin ~args ~archive
318+
| None -> no_extractor_error ~ext ~bin_names:Tar.bin_names)
319+
;;
320+
end
321+
322+
(** Make extraction atomic by extracting to a temp directory and renaming. *)
323+
let extract ~archive ~target =
324+
let target_in_temp =
325+
let prefix = Path.basename target in
326+
let suffix = Path.basename archive in
327+
Temp_dir.dir_for_target ~target ~prefix ~suffix
328+
in
329+
Fiber.finalize ~finally:(fun () ->
330+
Temp.destroy Dir target_in_temp;
331+
Fiber.return ())
332+
@@ fun () ->
333+
Path.mkdir_p target_in_temp;
334+
Extractor.run ~archive ~target:target_in_temp
335+
>>| function
336+
| Error _ as e -> e
337+
| Ok () ->
338+
(* Opam expects only one top-level directory and then strips it. We employ a
339+
similar heuristic here. Opam is stricter in the multiple top-level entries
340+
case since it errors, whereas we use the extraction directory as-is. *)
341+
let target_in_temp =
342+
match Path.readdir_unsorted_with_kinds target_in_temp with
343+
| Error error -> Error (Error.Read_dir_failed { archive; error })
344+
| Ok [ (fname, S_DIR) ] -> Ok (Path.relative target_in_temp fname)
345+
| Ok _ -> Ok target_in_temp
346+
in
347+
(match target_in_temp with
348+
| Error _ as e -> e
349+
| Ok target_in_temp ->
350+
(* CR-soon Alizter: this might already exist, don't do pointeless work *)
351+
Path.mkdir_p (Path.parent_exn target);
352+
(* CR-someday Alizter: Add fallback to copy if EXDEV raised here *)
353+
Path.rename target_in_temp target;
354+
Ok ())
355+
;;
356+
357+
let extract_exn ~archive ~target =
358+
extract ~archive ~target
359+
>>| function
360+
| Ok () -> ()
361+
| Error e -> Error.raise e
362+
;;

src/dune_pkg/archive.mli

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
open Stdune
2+
3+
(** Returns [true] if the filename has a supported archive extension
4+
(.tar, .tar.gz, .tgz, .tar.bz2, .tbz, .zip). *)
5+
val is_supported : Filename.t -> bool
6+
7+
module Error : sig
8+
type t = private
9+
| No_extractor of
10+
{ ext : string
11+
; tried : string list
12+
}
13+
| Command_failed of
14+
{ bin : Path.t
15+
; archive : Path.t
16+
; exit_code : int
17+
; stderr : string list
18+
}
19+
| Read_dir_failed of
20+
{ archive : Path.t
21+
; error : Unix_error.Detailed.t
22+
}
23+
24+
val message : t -> User_message.Style.t Pp.t list
25+
val raise : t -> 'a
26+
end
27+
28+
(** [extract ~archive ~target] extracts the archive at [archive] into the
29+
directory at [target], creating the directory if it doesn't already exist.
30+
The archive format is determined from the file extension. If the extension
31+
is not recognized, defaults to tar. *)
32+
val extract : archive:Path.t -> target:Path.t -> (unit, Error.t) result Fiber.t
33+
34+
(** Same as [extract] but raises [User_error] on failure. *)
35+
val extract_exn : archive:Path.t -> target:Path.t -> unit Fiber.t

0 commit comments

Comments
 (0)