[dbscripts,2/4] Add reproducible archive of packages.

Message ID 20181204180944.9648-2-eschwartz@archlinux.org
State New
Headers show
Series
  • [dbscripts,1/4] tests: make dummy copies of all pkgpool packages in the test environment
Related show

Commit Message

Eli Schwartz via arch-projects Dec. 4, 2018, 6:09 p.m. UTC
Whenever adding new package files to the pool of distributed packages,
hardlink a copy of every package it was built with, into a
"reproducible" pool, and log which file required it.

Signed-off-by: Eli Schwartz <eschwartz@archlinux.org>
---
 config                     |  1 +
 config.local.svn-community |  1 +
 config.local.svn-packages  |  1 +
 db-functions               | 49 +++++++++++++++++++++++++++++++-------
 db-update                  |  4 ++++
 5 files changed, 48 insertions(+), 8 deletions(-)

Comments

Eli Schwartz via arch-projects Dec. 4, 2018, 6:15 p.m. UTC | #1
On 12/4/18 1:09 PM, Eli Schwartz wrote:
> Whenever adding new package files to the pool of distributed packages,
> hardlink a copy of every package it was built with, into a
> "reproducible" pool, and log which file required it.

The question becomes, where can I store these? As-is, this will burden
the mirror network as well. Unsure how to handle this. Could this be
configurable by the mirror, as ISOs are now? Should we exclusively
self-host this, and if so, where?

archive.archlinux.org is managed by another service with its own
exclusively writable location.

> Signed-off-by: Eli Schwartz <eschwartz@archlinux.org>
> ---
>  config                     |  1 +
>  config.local.svn-community |  1 +
>  config.local.svn-packages  |  1 +
>  db-functions               | 49 +++++++++++++++++++++++++++++++-------
>  db-update                  |  4 ++++
>  5 files changed, 48 insertions(+), 8 deletions(-)
> 
> diff --git a/config b/config
> index 1cfe11f4..5144fca7 100644
> --- a/config
> +++ b/config
> @@ -3,6 +3,7 @@
>  FTP_BASE="/srv/ftp"
>  PKGREPOS=()
>  PKGPOOL=''
> +EXTRA_PKGPOOLS=()
>  SRCPOOL=''
>  TESTING_REPO=''
>  STABLE_REPOS=()
> diff --git a/config.local.svn-community b/config.local.svn-community
> index 5d61b5ea..15bcc17f 100644
> --- a/config.local.svn-community
> +++ b/config.local.svn-community
> @@ -2,6 +2,7 @@
>  
>  PKGREPOS=('community' 'community-testing' 'community-staging' 'multilib' 'multilib-testing' 'multilib-staging')
>  PKGPOOL='pool/community'
> +EXTRA_PKGPOOLS=('pool/packages')
>  SRCPOOL='sources/community'
>  SVNREPO='file:///srv/repos/svn-community/svn'
>  SVNUSER='svn-community'
> diff --git a/config.local.svn-packages b/config.local.svn-packages
> index 34aab35c..75986b65 100644
> --- a/config.local.svn-packages
> +++ b/config.local.svn-packages
> @@ -2,6 +2,7 @@
>  
>  PKGREPOS=('core' 'extra' 'testing' 'staging' 'kde-unstable' 'gnome-unstable')
>  PKGPOOL='pool/packages'
> +EXTRA_PKGPOOLS=('pool/community')
>  SRCPOOL='sources/packages'
>  SVNREPO='file:///srv/repos/svn-packages/svn'
>  SVNUSER='svn-packages'
> diff --git a/db-functions b/db-functions
> index 7aeedced..2b1ae87a 100644
> --- a/db-functions
> +++ b/db-functions
> @@ -165,20 +165,23 @@ repo_unlock () { #repo_unlock <repo-name> <arch>
>  	fi
>  }
>  
> +# usage: _grep_all_info pkgfile infofile key
> +_grep_all_info() {
> +	local _ret=()
> +
> +	mapfile -t _ret < <(/usr/bin/bsdtar -xOqf "$1" "${2}" | grep "^${3} = ")
> +
> +	printf '%s\n' "${_ret[@]#${3} = }"
> +}
> +
>  # usage: _grep_pkginfo pkgfile pattern
>  _grep_pkginfo() {
> -	local _ret
> -
> -	_ret="$(/usr/bin/bsdtar -xOqf "$1" .PKGINFO | grep "^${2} = " | tail -1)"
> -	echo "${_ret#${2} = }"
> +	_grep_all_info "${1}" .PKGINFO "${2}" | tail -1
>  }
>  
>  # usage: _grep_buildinfo pkgfile pattern
>  _grep_buildinfo() {
> -	local _ret
> -
> -	_ret="$(/usr/bin/bsdtar -xOqf "$1" .BUILDINFO | grep "^${2} = " | tail -1)"
> -	echo "${_ret#${2} = }"
> +	_grep_all_info "${1}" .BUILDINFO "${2}" | tail -1
>  }
>  
>  # Get the package base or name as fallback
> @@ -444,4 +447,34 @@ arch_repo_modify() {
>  	REPO_MODIFIED=1
>  }
>  
> +# Build an index of dependent packages needed by a given pkgfile
> +# usage: make_reproducible pkgfile [check]
> +make_reproducible() {
> +	local pkg dir pkgs=() pkgfile pkgfiles=()
> +
> +	mapfile -t pkgs < <(_grep_all_info "${1}" .BUILDINFO installed)
> +
> +	for pkg in "${pkgs[@]}"; do
> +		for dir in "${FTP_BASE}/${PKGPOOL}" "${EXTRA_PKGPOOLS[@]/#/${FTP_BASE}/}" "${STAGING}"/**/; do
> +			if pkgfile="$(getpkgfile "${dir}/${pkg}"${PKGEXTS} 2>/dev/null)"; then
> +				pkgfiles+=("${pkgfile}")
> +				continue 2
> +			fi
> +		done
> +		error "could not find existing package for %s" "${pkg}"
> +		return 1
> +	done
> +
> +	if [[ ${2} = check ]]; then
> +		return 0
> +	fi
> +
> +	for pkg in "${pkgfiles[@]}"; do
> +		if [[ ! -f ${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/} ]]; then
> +			ln -L "${pkg}" "${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/}"
> +		fi
> +		echo "${1}" >> "${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/}.buildlinks"
> +	done
> +}
> +
>  . "$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")/db-functions-${VCS}"
> diff --git a/db-update b/db-update
> index 313fb999..11ec185f 100755
> --- a/db-update
> +++ b/db-update
> @@ -61,6 +61,9 @@ for repo in "${repos[@]}"; do
>  			if ! check_builddir "${pkg}"; then
>  				die "Package %s was not built in a chroot" "$repo/${pkg##*/}"
>  			fi
> +			if ! make_reproducible "${pkg}" "check"; then
> +				die "Package %s is not reproducible" "${pkg}"
> +			fi
>  		done
>  		if ! check_splitpkgs "${repo}" "${pkgs[@]}"; then
>  			die "Missing split packages for %s" "$repo"
> @@ -82,6 +85,7 @@ for repo in "${repos[@]}"; do
>  			# any packages might have been moved by the previous run
>  			if [[ -f ${pkg} ]]; then
>  				mv "${pkg}" "$FTP_BASE/${PKGPOOL}"
> +				make_reproducible "${FTP_BASE}/${PKGPOOL}${pkg##*/}"
>  			fi
>  			ln -s "../../../${PKGPOOL}/${pkgfile}" "$FTP_BASE/$repo/os/${pkgarch}"
>  			# also move signatures
>
Eli Schwartz via arch-projects Dec. 5, 2018, 9:49 a.m. UTC | #2
On Tue, Dec 04, 2018 at 01:15:20PM -0500, Eli Schwartz via arch-devops <arch-devops@lists.archlinux.org> wrote:
> On 12/4/18 1:09 PM, Eli Schwartz wrote:
> > Whenever adding new package files to the pool of distributed packages,
> > hardlink a copy of every package it was built with, into a
> > "reproducible" pool, and log which file required it.
> 
> The question becomes, where can I store these? As-is, this will burden
> the mirror network as well. Unsure how to handle this. Could this be
> configurable by the mirror, as ISOs are now? Should we exclusively
> self-host this, and if so, where?

I'm not a fan of adding this pool to the mirror root for multiple
reasons:

 - Most mirrors would likely want to avoid mirroring it because it can
   become quite large and we told them that we only need around 100GB.
   If everyone wants to exclude it, that requires action by every admin.
   Not ideal.

 - I'm not sure if all of our mirrors have hardlink support. We don't
   currently ask for it even though we suggest the -H rsync option. Also
   the current repos use symlinks for the packages instead of hardlinks.
   That said, I'm not even sure if rsync can detect hardlinks across
   directories. It can't even detect renames/moves across directories...

 - I don't expect that we need to mirror it because we don't even get
   that many requests to our current archive. If we ever need to mirror
   it, we can worry about that later, I'd say, since moving it to the
   mirror root should be rather simple.

I'd suggest making the base path of the repro pool configurable so that
we can keep it out of the mirror root. For now, I'd suggest something
like this:

REPRO_BASE="/srv/reproducible-archive/"
pkgname="foo"
pkgfile="foo-1.0-1.pkg.tar.xz"
dest="$REPRO_BASE/packages/${pkgname:0:1}/$pkgname/$pkgfile"
ln .. "$dest"

Also note that this intentionally no longer includes $PKGPOOL, even
though your patch does. The archive doesn't have it and I don't think
it really helps anyone. It would just cause confusion when packages are
moved between repos, and it makes using the archive more difficult
because the user would have to check all possible pool names or know
which one to check.

Ideally I'd like to later extend this to also include the current
archive's features and from the looks of it, storing the packages like
this is the first step. Then we just need to copy the repos (dbs and pkg
symlinks) once a day and archive the ISOs.

Also thinking about this, it would be great if we could skip the pkg
symlinks for each day's archive and only copy the db itself. All we'd
need is to have a dedicated PackageServer= setting (like Server=, but
only for packages, not for the database) for pacman to find the
packages, but I'm not sure if Allan would like that. That setting would
also have to support the pkgname substring and the pkgname obviously.

Comments/thoughts/patches/... welcome.

Florian
Jelle van der Waa Dec. 12, 2018, 8:55 a.m. UTC | #3
On 12/04/18 at 01:15pm, Eli Schwartz via arch-devops wrote:
> On 12/4/18 1:09 PM, Eli Schwartz wrote:
> > Whenever adding new package files to the pool of distributed packages,
> > hardlink a copy of every package it was built with, into a
> > "reproducible" pool, and log which file required it.

Does this also clean up the archive? As in remove packages which are not
required for reproducible builds? Since now our archive server is almost
running out of space again.

> The question becomes, where can I store these? As-is, this will burden
> the mirror network as well. Unsure how to handle this. Could this be
> configurable by the mirror, as ISOs are now? Should we exclusively
> self-host this, and if so, where?
> 
> archive.archlinux.org is managed by another service with its own
> exclusively writable location.
> 
> > Signed-off-by: Eli Schwartz <eschwartz@archlinux.org>
> > ---
> >  config                     |  1 +
> >  config.local.svn-community |  1 +
> >  config.local.svn-packages  |  1 +
> >  db-functions               | 49 +++++++++++++++++++++++++++++++-------
> >  db-update                  |  4 ++++
> >  5 files changed, 48 insertions(+), 8 deletions(-)
> > 
> > diff --git a/config b/config
> > index 1cfe11f4..5144fca7 100644
> > --- a/config
> > +++ b/config
> > @@ -3,6 +3,7 @@
> >  FTP_BASE="/srv/ftp"
> >  PKGREPOS=()
> >  PKGPOOL=''
> > +EXTRA_PKGPOOLS=()
> >  SRCPOOL=''
> >  TESTING_REPO=''
> >  STABLE_REPOS=()
> > diff --git a/config.local.svn-community b/config.local.svn-community
> > index 5d61b5ea..15bcc17f 100644
> > --- a/config.local.svn-community
> > +++ b/config.local.svn-community
> > @@ -2,6 +2,7 @@
> >  
> >  PKGREPOS=('community' 'community-testing' 'community-staging' 'multilib' 'multilib-testing' 'multilib-staging')
> >  PKGPOOL='pool/community'
> > +EXTRA_PKGPOOLS=('pool/packages')
> >  SRCPOOL='sources/community'
> >  SVNREPO='file:///srv/repos/svn-community/svn'
> >  SVNUSER='svn-community'
> > diff --git a/config.local.svn-packages b/config.local.svn-packages
> > index 34aab35c..75986b65 100644
> > --- a/config.local.svn-packages
> > +++ b/config.local.svn-packages
> > @@ -2,6 +2,7 @@
> >  
> >  PKGREPOS=('core' 'extra' 'testing' 'staging' 'kde-unstable' 'gnome-unstable')
> >  PKGPOOL='pool/packages'
> > +EXTRA_PKGPOOLS=('pool/community')
> >  SRCPOOL='sources/packages'
> >  SVNREPO='file:///srv/repos/svn-packages/svn'
> >  SVNUSER='svn-packages'
> > diff --git a/db-functions b/db-functions
> > index 7aeedced..2b1ae87a 100644
> > --- a/db-functions
> > +++ b/db-functions
> > @@ -165,20 +165,23 @@ repo_unlock () { #repo_unlock <repo-name> <arch>
> >  	fi
> >  }
> >  
> > +# usage: _grep_all_info pkgfile infofile key
> > +_grep_all_info() {
> > +	local _ret=()
> > +
> > +	mapfile -t _ret < <(/usr/bin/bsdtar -xOqf "$1" "${2}" | grep "^${3} = ")
> > +
> > +	printf '%s\n' "${_ret[@]#${3} = }"
> > +}
> > +
> >  # usage: _grep_pkginfo pkgfile pattern
> >  _grep_pkginfo() {
> > -	local _ret
> > -
> > -	_ret="$(/usr/bin/bsdtar -xOqf "$1" .PKGINFO | grep "^${2} = " | tail -1)"
> > -	echo "${_ret#${2} = }"
> > +	_grep_all_info "${1}" .PKGINFO "${2}" | tail -1
> >  }
> >  
> >  # usage: _grep_buildinfo pkgfile pattern
> >  _grep_buildinfo() {
> > -	local _ret
> > -
> > -	_ret="$(/usr/bin/bsdtar -xOqf "$1" .BUILDINFO | grep "^${2} = " | tail -1)"
> > -	echo "${_ret#${2} = }"
> > +	_grep_all_info "${1}" .BUILDINFO "${2}" | tail -1
> >  }
> >  
> >  # Get the package base or name as fallback
> > @@ -444,4 +447,34 @@ arch_repo_modify() {
> >  	REPO_MODIFIED=1
> >  }
> >  
> > +# Build an index of dependent packages needed by a given pkgfile
> > +# usage: make_reproducible pkgfile [check]
> > +make_reproducible() {
> > +	local pkg dir pkgs=() pkgfile pkgfiles=()
> > +
> > +	mapfile -t pkgs < <(_grep_all_info "${1}" .BUILDINFO installed)
> > +
> > +	for pkg in "${pkgs[@]}"; do
> > +		for dir in "${FTP_BASE}/${PKGPOOL}" "${EXTRA_PKGPOOLS[@]/#/${FTP_BASE}/}" "${STAGING}"/**/; do
> > +			if pkgfile="$(getpkgfile "${dir}/${pkg}"${PKGEXTS} 2>/dev/null)"; then
> > +				pkgfiles+=("${pkgfile}")
> > +				continue 2
> > +			fi
> > +		done
> > +		error "could not find existing package for %s" "${pkg}"
> > +		return 1
> > +	done
> > +
> > +	if [[ ${2} = check ]]; then
> > +		return 0
> > +	fi
> > +
> > +	for pkg in "${pkgfiles[@]}"; do
> > +		if [[ ! -f ${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/} ]]; then
> > +			ln -L "${pkg}" "${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/}"
> > +		fi
> > +		echo "${1}" >> "${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/}.buildlinks"
> > +	done
> > +}
> > +
> >  . "$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")/db-functions-${VCS}"
> > diff --git a/db-update b/db-update
> > index 313fb999..11ec185f 100755
> > --- a/db-update
> > +++ b/db-update
> > @@ -61,6 +61,9 @@ for repo in "${repos[@]}"; do
> >  			if ! check_builddir "${pkg}"; then
> >  				die "Package %s was not built in a chroot" "$repo/${pkg##*/}"
> >  			fi
> > +			if ! make_reproducible "${pkg}" "check"; then
> > +				die "Package %s is not reproducible" "${pkg}"
> > +			fi
> >  		done
> >  		if ! check_splitpkgs "${repo}" "${pkgs[@]}"; then
> >  			die "Missing split packages for %s" "$repo"
> > @@ -82,6 +85,7 @@ for repo in "${repos[@]}"; do
> >  			# any packages might have been moved by the previous run
> >  			if [[ -f ${pkg} ]]; then
> >  				mv "${pkg}" "$FTP_BASE/${PKGPOOL}"
> > +				make_reproducible "${FTP_BASE}/${PKGPOOL}${pkg##*/}"
> >  			fi
> >  			ln -s "../../../${PKGPOOL}/${pkgfile}" "$FTP_BASE/$repo/os/${pkgarch}"
> >  			# also move signatures
> > 
> 
> 
> -- 
> Eli Schwartz
> Bug Wrangler and Trusted User
>
Eli Schwartz via arch-projects Dec. 12, 2018, 1:53 p.m. UTC | #4
On 12/12/18 3:55 AM, Jelle van der Waa wrote:
> On 12/04/18 at 01:15pm, Eli Schwartz via arch-devops wrote:
>> On 12/4/18 1:09 PM, Eli Schwartz wrote:
>>> Whenever adding new package files to the pool of distributed packages,
>>> hardlink a copy of every package it was built with, into a
>>> "reproducible" pool, and log which file required it.
> 
> Does this also clean up the archive? As in remove packages which are not
> required for reproducible builds? Since now our archive server is almost
> running out of space again.

Patch 4/4 will do so, but I only cc'ed patch 2/4 to the devops list.

Patch

diff --git a/config b/config
index 1cfe11f4..5144fca7 100644
--- a/config
+++ b/config
@@ -3,6 +3,7 @@ 
 FTP_BASE="/srv/ftp"
 PKGREPOS=()
 PKGPOOL=''
+EXTRA_PKGPOOLS=()
 SRCPOOL=''
 TESTING_REPO=''
 STABLE_REPOS=()
diff --git a/config.local.svn-community b/config.local.svn-community
index 5d61b5ea..15bcc17f 100644
--- a/config.local.svn-community
+++ b/config.local.svn-community
@@ -2,6 +2,7 @@ 
 
 PKGREPOS=('community' 'community-testing' 'community-staging' 'multilib' 'multilib-testing' 'multilib-staging')
 PKGPOOL='pool/community'
+EXTRA_PKGPOOLS=('pool/packages')
 SRCPOOL='sources/community'
 SVNREPO='file:///srv/repos/svn-community/svn'
 SVNUSER='svn-community'
diff --git a/config.local.svn-packages b/config.local.svn-packages
index 34aab35c..75986b65 100644
--- a/config.local.svn-packages
+++ b/config.local.svn-packages
@@ -2,6 +2,7 @@ 
 
 PKGREPOS=('core' 'extra' 'testing' 'staging' 'kde-unstable' 'gnome-unstable')
 PKGPOOL='pool/packages'
+EXTRA_PKGPOOLS=('pool/community')
 SRCPOOL='sources/packages'
 SVNREPO='file:///srv/repos/svn-packages/svn'
 SVNUSER='svn-packages'
diff --git a/db-functions b/db-functions
index 7aeedced..2b1ae87a 100644
--- a/db-functions
+++ b/db-functions
@@ -165,20 +165,23 @@  repo_unlock () { #repo_unlock <repo-name> <arch>
 	fi
 }
 
+# usage: _grep_all_info pkgfile infofile key
+_grep_all_info() {
+	local _ret=()
+
+	mapfile -t _ret < <(/usr/bin/bsdtar -xOqf "$1" "${2}" | grep "^${3} = ")
+
+	printf '%s\n' "${_ret[@]#${3} = }"
+}
+
 # usage: _grep_pkginfo pkgfile pattern
 _grep_pkginfo() {
-	local _ret
-
-	_ret="$(/usr/bin/bsdtar -xOqf "$1" .PKGINFO | grep "^${2} = " | tail -1)"
-	echo "${_ret#${2} = }"
+	_grep_all_info "${1}" .PKGINFO "${2}" | tail -1
 }
 
 # usage: _grep_buildinfo pkgfile pattern
 _grep_buildinfo() {
-	local _ret
-
-	_ret="$(/usr/bin/bsdtar -xOqf "$1" .BUILDINFO | grep "^${2} = " | tail -1)"
-	echo "${_ret#${2} = }"
+	_grep_all_info "${1}" .BUILDINFO "${2}" | tail -1
 }
 
 # Get the package base or name as fallback
@@ -444,4 +447,34 @@  arch_repo_modify() {
 	REPO_MODIFIED=1
 }
 
+# Build an index of dependent packages needed by a given pkgfile
+# usage: make_reproducible pkgfile [check]
+make_reproducible() {
+	local pkg dir pkgs=() pkgfile pkgfiles=()
+
+	mapfile -t pkgs < <(_grep_all_info "${1}" .BUILDINFO installed)
+
+	for pkg in "${pkgs[@]}"; do
+		for dir in "${FTP_BASE}/${PKGPOOL}" "${EXTRA_PKGPOOLS[@]/#/${FTP_BASE}/}" "${STAGING}"/**/; do
+			if pkgfile="$(getpkgfile "${dir}/${pkg}"${PKGEXTS} 2>/dev/null)"; then
+				pkgfiles+=("${pkgfile}")
+				continue 2
+			fi
+		done
+		error "could not find existing package for %s" "${pkg}"
+		return 1
+	done
+
+	if [[ ${2} = check ]]; then
+		return 0
+	fi
+
+	for pkg in "${pkgfiles[@]}"; do
+		if [[ ! -f ${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/} ]]; then
+			ln -L "${pkg}" "${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/}"
+		fi
+		echo "${1}" >> "${FTP_BASE}/${PKGPOOL}-reproducible/${pkg##*/}.buildlinks"
+	done
+}
+
 . "$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")/db-functions-${VCS}"
diff --git a/db-update b/db-update
index 313fb999..11ec185f 100755
--- a/db-update
+++ b/db-update
@@ -61,6 +61,9 @@  for repo in "${repos[@]}"; do
 			if ! check_builddir "${pkg}"; then
 				die "Package %s was not built in a chroot" "$repo/${pkg##*/}"
 			fi
+			if ! make_reproducible "${pkg}" "check"; then
+				die "Package %s is not reproducible" "${pkg}"
+			fi
 		done
 		if ! check_splitpkgs "${repo}" "${pkgs[@]}"; then
 			die "Missing split packages for %s" "$repo"
@@ -82,6 +85,7 @@  for repo in "${repos[@]}"; do
 			# any packages might have been moved by the previous run
 			if [[ -f ${pkg} ]]; then
 				mv "${pkg}" "$FTP_BASE/${PKGPOOL}"
+				make_reproducible "${FTP_BASE}/${PKGPOOL}${pkg##*/}"
 			fi
 			ln -s "../../../${PKGPOOL}/${pkgfile}" "$FTP_BASE/$repo/os/${pkgarch}"
 			# also move signatures