makepkg: use bsdtar --no-read-sparse for archive creation if available

Message ID 20220310153842.133677-1-diabonas@archlinux.org
State Accepted, archived
Headers show
Series makepkg: use bsdtar --no-read-sparse for archive creation if available | expand

Commit Message

Jonas Witschel March 10, 2022, 3:38 p.m. UTC
bsdtar uses the "pax" TAR archive format by default, which has support for
storing sparse file information in the archive. Unfortunately, this is a source
of non-reproducibility because the sparse encoding is taken from the file system
and different file systems handle sparse files differently: some file systems
have no support for sparsely encoded files at all, and even file systems with
sparse file support can report different file information for identical files
due to differing implementations.

As a real world example where this happens, consider the Arch Linux package
"brotli-testdata 1.0.9-7", which contains a sparsely encoded all-zeros file
"usr/share/brotli/testdata/zeros". Building this package on a btrfs file system
yields a different package than building it on tmpfs or ext4 solely due to
different sparse file information that gets recorded in the package tarball.

To improve the reproducibility of archives containing sparsely encoded files,
libarchive version 3.6.0 introduces a new --no-read-sparse option. This skips
reading sparse file information from disk entirely and therefore stores files
"expanded" in the archive, which is the only way to make them reliably
reproducible across file systems.

makepkg will use this option if the libarchive version detected at build time
is recent enough to support it (3.6.0 or later).
---
 build-aux/edit-script.sh.in | 1 +
 meson.build                 | 6 ++++++
 scripts/makepkg.sh.in       | 4 ++--
 3 files changed, 9 insertions(+), 2 deletions(-)

Comments

Allan McRae March 10, 2022, 11:44 p.m. UTC | #1
On 11/3/22 01:38, Jonas Witschel wrote:
> bsdtar uses the "pax" TAR archive format by default, which has support for
> storing sparse file information in the archive. Unfortunately this is a source
> of unreproducibility because the sparse encoding is taken from the file system
> and different file systems handle sparse files differently: some file systems
> have no support for sparsely encoded files at all, and even file systems with
> sparse file support can report different file information for identical files
> due to differing implementations.
> 
> As a real world example where this happens, consider the Arch Linux package
> "brotli-testdata 1.0.9-7", which contains a sparsely encoded all-zeros file
> "usr/share/brotli/testdata/zeros". Building this package on a btrfs file system
> yields a different package than building it on tmpfs or ext4 solely due to
> different sparse file information that gets recorded in the package tarball.
> 
> To improve the reproducibility of archives containing sparsely encoded files,
> libarchive version 3.6.0 introduces a new --no-read-sparse option. This skips
> reading sparse file information from disk entirely and therefore stores files
> "expanded" in the archive, which is the only way to make them reliably
> reproducible across file systems.
> 
> makepkg will use this option if libarchive is recent enough to support it,
> which is detected at build time.
> ---


LGTM.  Thanks.

A

>   build-aux/edit-script.sh.in | 1 +
>   meson.build                 | 6 ++++++
>   scripts/makepkg.sh.in       | 4 ++--
>   3 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/build-aux/edit-script.sh.in b/build-aux/edit-script.sh.in
> index 85c56cfe..992033b2 100644
> --- a/build-aux/edit-script.sh.in
> +++ b/build-aux/edit-script.sh.in
> @@ -20,6 +20,7 @@ sed \
>     -e "s|@DEBUGSUFFIX[@]|@DEBUGSUFFIX@|g" \
>     -e "s|@INODECMD[@]|@INODECMD@|g" \
>     -e "s|@FILECMD[@]|@FILECMD@|g" \
> +  -e "s|@BSDTAR_NO_READ_SPARSE[@]|@BSDTAR_NO_READ_SPARSE@|g" \
>     "$input" >"$output"
>   
>   if [[ $mode ]]; then
> diff --git a/meson.build b/meson.build
> index b7cca865..1519a2bb 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -91,6 +91,11 @@ endif
>   libarchive = dependency('libarchive',
>                           version : '>=3.0.0',
>                           static : get_option('buildstatic'))
> +if libarchive.version().version_compare('>=3.6.0')
> +  bsdtar_no_read_sparse = '--no-read-sparse'
> +else
> +  bsdtar_no_read_sparse = ''
> +endif
>   
>   libcurl = dependency('libcurl',
>                        version : '>=7.55.0',
> @@ -274,6 +279,7 @@ substs.set('LIBMAKEPKGDIR', LIBMAKEPKGDIR)
>   substs.set('STRIP_BINARIES', strip_binaries)
>   substs.set('STRIP_SHARED', strip_shared)
>   substs.set('STRIP_STATIC', strip_static)
> +substs.set('BSDTAR_NO_READ_SPARSE', bsdtar_no_read_sparse)
>   
>   subdir('lib/libalpm')
>   subdir('src/common')
> diff --git a/scripts/makepkg.sh.in b/scripts/makepkg.sh.in
> index 5aaabf63..69757d03 100644
> --- a/scripts/makepkg.sh.in
> +++ b/scripts/makepkg.sh.in
> @@ -616,7 +616,7 @@ create_package() {
>   	msg2 "$(gettext "Compressing package...")"
>   	# TODO: Maybe this can be set globally for robustness
>   	shopt -s -o pipefail
> -	list_package_files | LANG=C bsdtar --no-fflags -cnf - --null --files-from - |
> +	list_package_files | LANG=C bsdtar --no-fflags @BSDTAR_NO_READ_SPARSE@ -cnf - --null --files-from - |
>   		compress_as "$PKGEXT" > "${pkg_file}" || ret=$?
>   
>   	shopt -u -o pipefail
> @@ -714,7 +714,7 @@ create_srcpackage() {
>   
>   	# TODO: Maybe this can be set globally for robustness
>   	shopt -s -o pipefail
> -	LANG=C bsdtar --no-fflags -cLf - ${pkgbase} | compress_as "$SRCEXT" > "${pkg_file}" || ret=$?
> +	LANG=C bsdtar --no-fflags @BSDTAR_NO_READ_SPARSE@ -cLf - ${pkgbase} | compress_as "$SRCEXT" > "${pkg_file}" || ret=$?
>   
>   	shopt -u -o pipefail
>

Patch

diff --git a/build-aux/edit-script.sh.in b/build-aux/edit-script.sh.in
index 85c56cfe..992033b2 100644
--- a/build-aux/edit-script.sh.in
+++ b/build-aux/edit-script.sh.in
@@ -20,6 +20,7 @@  sed \
   -e "s|@DEBUGSUFFIX[@]|@DEBUGSUFFIX@|g" \
   -e "s|@INODECMD[@]|@INODECMD@|g" \
   -e "s|@FILECMD[@]|@FILECMD@|g" \
+  -e "s|@BSDTAR_NO_READ_SPARSE[@]|@BSDTAR_NO_READ_SPARSE@|g" \
   "$input" >"$output"
 
 if [[ $mode ]]; then
diff --git a/meson.build b/meson.build
index b7cca865..1519a2bb 100644
--- a/meson.build
+++ b/meson.build
@@ -91,6 +91,11 @@  endif
 libarchive = dependency('libarchive',
                         version : '>=3.0.0',
                         static : get_option('buildstatic'))
+if libarchive.version().version_compare('>=3.6.0')
+  bsdtar_no_read_sparse = '--no-read-sparse'
+else
+  bsdtar_no_read_sparse = ''
+endif
 
 libcurl = dependency('libcurl',
                      version : '>=7.55.0',
@@ -274,6 +279,7 @@  substs.set('LIBMAKEPKGDIR', LIBMAKEPKGDIR)
 substs.set('STRIP_BINARIES', strip_binaries)
 substs.set('STRIP_SHARED', strip_shared)
 substs.set('STRIP_STATIC', strip_static)
+substs.set('BSDTAR_NO_READ_SPARSE', bsdtar_no_read_sparse)
 
 subdir('lib/libalpm')
 subdir('src/common')
diff --git a/scripts/makepkg.sh.in b/scripts/makepkg.sh.in
index 5aaabf63..69757d03 100644
--- a/scripts/makepkg.sh.in
+++ b/scripts/makepkg.sh.in
@@ -616,7 +616,7 @@  create_package() {
 	msg2 "$(gettext "Compressing package...")"
 	# TODO: Maybe this can be set globally for robustness
 	shopt -s -o pipefail
-	list_package_files | LANG=C bsdtar --no-fflags -cnf - --null --files-from - |
+	list_package_files | LANG=C bsdtar --no-fflags @BSDTAR_NO_READ_SPARSE@ -cnf - --null --files-from - |
 		compress_as "$PKGEXT" > "${pkg_file}" || ret=$?
 
 	shopt -u -o pipefail
@@ -714,7 +714,7 @@  create_srcpackage() {
 
 	# TODO: Maybe this can be set globally for robustness
 	shopt -s -o pipefail
-	LANG=C bsdtar --no-fflags -cLf - ${pkgbase} | compress_as "$SRCEXT" > "${pkg_file}" || ret=$?
+	LANG=C bsdtar --no-fflags @BSDTAR_NO_READ_SPARSE@ -cLf - ${pkgbase} | compress_as "$SRCEXT" > "${pkg_file}" || ret=$?
 
 	shopt -u -o pipefail