[pacman-dev] Introduce alpm_dbs_update() function for parallel db updates

Message ID 20200306203502.202482-1-anatol.pomozov@gmail.com
State Superseded, archived
Series [pacman-dev] Introduce alpm_dbs_update() function for parallel db updates

Commit Message

Anatol Pomozov March 6, 2020, 8:35 p.m. UTC
This is an equivalent of alpm_db_update() but for multiplexed (parallel)
download. The difference is that this function accepts a list of
databases to update, and the ALPM internals then download them in
parallel if possible.

Add a stub for _alpm_multi_download(), the function that will perform
parallel payload downloads in the future.

Introduce a dload_payload->filepath field that contains the URL path of
the file we download. It is like the fileurl field but without the
protocol/server part. The rationale for this field is that with the curl
multi-download the server retry logic is going to move into a curl
callback, and the callback needs to be able to reconstruct the 'next'
fileurl. It can do so by taking the next server URL from the 'servers'
list and concatenating it with filepath. Once the 'parallel download'
refactoring is over, the 'fileurl' field will go away.
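
To illustrate the idea (a hypothetical helper, not part of this patch), the
retry callback could rebuild the next fileurl from the remaining 'servers'
list and filepath roughly like this:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "alpm_list.h"

/* Hypothetical: build "<server>/<filepath>" for the given server entry.
 * Returns a malloc'd string, or NULL on allocation failure. */
static char *build_next_fileurl(alpm_list_t *server, const char *filepath)
{
	const char *base = server->data;
	size_t len = strlen(base) + strlen(filepath) + 2; /* '/' and '\0' */
	char *fileurl = malloc(len);
	if(fileurl) {
		snprintf(fileurl, len, "%s/%s", base, filepath);
	}
	return fileurl;
}

On a retry the callback would advance to server->next and call this again,
giving up once the list is exhausted.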

Signed-off-by: Anatol Pomozov <anatol.pomozov@gmail.com>
---
 lib/libalpm/alpm.h    |   2 +
 lib/libalpm/be_sync.c | 132 ++++++++++++++++++++++++++++++++++++++++++
 lib/libalpm/dload.c   |  12 ++++
 lib/libalpm/dload.h   |   5 ++
 4 files changed, 151 insertions(+)

Comments

Anatol Pomozov March 6, 2020, 8:48 p.m. UTC | #1
Hi

On Fri, Mar 6, 2020 at 12:35 PM Anatol Pomozov <anatol.pomozov@gmail.com> wrote:
>
> This is an equivalent of alpm_db_update but for multiplexed (parallel)
> download. The difference is that this function accepts list of
> databases to update. And then ALPM internals download it in parallel if
> possible.
>
> Add a stub for _alpm_multi_download the function that will do parallel
> payloads downloads in the future.
>
> Introduce dload_payload->filepath field that contains url path to the
> file we download. It is like fileurl field but does not contain
> protocol/server part. The rationale for having this field is that with
> the curl multidownload the server retry logic is going to move to a curl
> callback. And the callback needs to be able to reconstruct the 'next'
> fileurl. One will be able to do it by getting the next server url from
> 'servers' list and then concat with filepath. Once the 'parallel download'
> refactoring is over 'fileurl' field will go away.
>
> Signed-off-by: Anatol Pomozov <anatol.pomozov@gmail.com>
> ---
>  lib/libalpm/alpm.h    |   2 +
>  lib/libalpm/be_sync.c | 132 ++++++++++++++++++++++++++++++++++++++++++
>  lib/libalpm/dload.c   |  12 ++++
>  lib/libalpm/dload.h   |   5 ++
>  4 files changed, 151 insertions(+)
>
> diff --git a/lib/libalpm/alpm.h b/lib/libalpm/alpm.h
> index 93b97f44..eb0490eb 100644
> --- a/lib/libalpm/alpm.h
> +++ b/lib/libalpm/alpm.h
> @@ -1045,6 +1045,8 @@ int alpm_db_remove_server(alpm_db_t *db, const char *url);
>   */
>  int alpm_db_update(int force, alpm_db_t *db);
>
> +int alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, int failfast);
> +
>  /** Get a package entry from a package database.
>   * @param db pointer to the package database to get the package from
>   * @param name of the package
> diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
> index aafed15d..cdb46bd9 100644
> --- a/lib/libalpm/be_sync.c
> +++ b/lib/libalpm/be_sync.c
> @@ -301,6 +301,138 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
>         return ret;
>  }
>
> +/** Update list of databases. This function may run updates in parallel.
> + *
> + * @param dbs a list of alpm_db_t to update.
> + */
> +int SYMEXPORT alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, UNUSED int failfast) {

One question I had initially is whether we need a 'failfast' option for
parallel downloads. Failfast means that once any error is detected, ALPM
stops all the download streams right away and returns an error back to
the client.

But now I am not sure this option will be really useful. The current
multi-download implementation waits for the download transfers to
finish and only then returns to the client. That works just fine.

So I am thinking of removing the failfast option unless someone has a
strong opinion that we should keep it.
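
For context, a rough sketch (illustrative only, nothing like this is in the
patch) of one way a failfast flag could cancel in-flight curl transfers,
relying on the documented behaviour that a non-zero return from the progress
callback aborts that transfer:

#include <curl/curl.h>

/* Hypothetical: once a shared flag is set after the first failure, every
 * remaining transfer is aborted by returning non-zero from its callback. */
static int failfast_xferinfo_cb(void *userdata, curl_off_t dltotal,
		curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow)
{
	const int *abort_all = userdata;
	(void)dltotal; (void)dlnow; (void)ultotal; (void)ulnow;
	return *abort_all ? 1 : 0; /* non-zero aborts this transfer */
}

/* registered per easy handle with:
 *   curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, failfast_xferinfo_cb);
 *   curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &abort_all);
 *   curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
 */

Aborting on the first error this way would leave partially written files
behind, which is the concern raised in the next reply.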

> +       char *syncpath;
> +       const char *dbext = handle->dbext;
> +       alpm_list_t *i;
> +       int ret = -1;
> +       mode_t oldmask;
> +       alpm_list_t *payloads = NULL;
> +
> +       /* Sanity checks */
> +       ASSERT(dbs != NULL, return -1);
> +       handle->pm_errno = ALPM_ERR_OK;
> +
> +       syncpath = get_sync_dir(handle);
> +       ASSERT(syncpath != NULL, return -1);
> +
> +       /* make sure we have a sane umask */
> +       oldmask = umask(0022);
> +
> +       for(i = dbs; i; i = i->next) {
> +               alpm_db_t *db = i->data;
> +               int dbforce = force;
> +               struct dload_payload *payload = NULL;
> +               size_t len;
> +               int siglevel;
> +
> +               if(!(db->usage & ALPM_DB_USAGE_SYNC)) {
> +                       continue;
> +               }
> +
> +               ASSERT(db != handle->db_local, RET_ERR(handle, ALPM_ERR_WRONG_ARGS, -1));
> +               ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
> +
> +               /* force update of invalid databases to fix potential mismatched database/signature */
> +               if(db->status & DB_STATUS_INVALID) {
> +                       dbforce = 1;
> +               }
> +
> +               CALLOC(payload, 1, sizeof(*payload), RET_ERR(handle, ALPM_ERR_MEMORY, -1));
> +
> +               /* set hard upper limit of 128MiB */
> +               payload->max_size = 128 * 1024 * 1024;
> +               ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
> +               payload->servers = db->servers;
> +
> +               /* print server + filename into a buffer */
> +               len = strlen(db->treename) + strlen(dbext) + 1;
> +               MALLOC(payload->filepath, len, GOTO_ERR(handle, ALPM_ERR_MEMORY, cleanup));
> +               snprintf(payload->filepath, len, "%s%s", db->treename, dbext);
> +               payload->handle = handle;
> +               payload->force = dbforce;
> +               payload->unlink_on_fail = 1;
> +
> +               payloads = alpm_list_add(payloads, payload);
> +
> +               siglevel = alpm_db_get_siglevel(db);
> +               if(siglevel & ALPM_SIG_DATABASE) {
> +                       struct dload_payload *sig_payload;
> +                       CALLOC(sig_payload, 1, sizeof(*sig_payload), GOTO_ERR(handle, ALPM_ERR_MEMORY, cleanup));
> +
> +                       /* print filename into a buffer (leave space for separator and .sig) */
> +                       len = strlen(db->treename) + strlen(dbext) + 5;
> +                       MALLOC(sig_payload->filepath, len, goto cleanup);
> +                       snprintf(sig_payload->filepath, len, "%s%s.sig", db->treename, dbext);
> +
> +                       sig_payload->handle = handle;
> +                       sig_payload->force = dbforce;
> +                       sig_payload->errors_ok = (siglevel & ALPM_SIG_DATABASE_OPTIONAL);
> +
> +                       /* set hard upper limit of 16KiB */
> +                       sig_payload->max_size = 16 * 1024;
> +                       sig_payload->servers = db->servers;
> +
> +                       payloads = alpm_list_add(payloads, sig_payload);
> +               }
> +       }
> +
> +       /* attempt to grab a lock */
> +       if(_alpm_handle_lock(handle)) {
> +               GOTO_ERR(handle, ALPM_ERR_HANDLE_LOCK, cleanup);
> +       }
> +
> +       ret = _alpm_multi_download(handle, payloads, syncpath);
> +       if(ret < 0) {
> +               goto cleanup;
> +       }
> +
> +       for(i = dbs; i; i = i->next) {
> +               alpm_db_t *db = i->data;
> +               if(!(db->usage & ALPM_DB_USAGE_SYNC)) {
> +                       continue;
> +               }
> +
> +               /* Cache needs to be rebuilt */
> +               _alpm_db_free_pkgcache(db);
> +
> +               /* clear all status flags regarding validity/existence */
> +               db->status &= ~DB_STATUS_VALID;
> +               db->status &= ~DB_STATUS_INVALID;
> +               db->status &= ~DB_STATUS_EXISTS;
> +               db->status &= ~DB_STATUS_MISSING;
> +
> +               /* if the download failed skip validation to preserve the download error */
> +               if(sync_db_validate(db) != 0) {
> +                       /* pm_errno should be set */
> +                       ret = -1;
> +               }
> +       }
> +
> +cleanup:
> +       _alpm_handle_unlock(handle);
> +
> +       if(ret == -1) {
> +               /* pm_errno was set by the download code */
> +               _alpm_log(handle, ALPM_LOG_DEBUG, "failed to sync dbs: %s\n",
> +                               alpm_strerror(handle->pm_errno));
> +       } else {
> +               handle->pm_errno = ALPM_ERR_OK;
> +       }
> +
> +       if(payloads) {
> +               alpm_list_free_inner(payloads, (alpm_list_fn_free)_alpm_dload_payload_reset);
> +               FREELIST(payloads);
> +       }
> +       free(syncpath);
> +       umask(oldmask);
> +       return ret;
> +}
> +
>  /* Forward decl so I don't reorganize the whole file right now */
>  static int sync_db_read(alpm_db_t *db, struct archive *archive,
>                 struct archive_entry *entry, alpm_pkg_t **likely_pkg);
> diff --git a/lib/libalpm/dload.c b/lib/libalpm/dload.c
> index 670da03d..7cd3e3a4 100644
> --- a/lib/libalpm/dload.c
> +++ b/lib/libalpm/dload.c
> @@ -636,6 +636,16 @@ int _alpm_download(struct dload_payload *payload, const char *localpath,
>         }
>  }
>
> +int _alpm_multi_download(alpm_handle_t *handle,
> +               alpm_list_t *payloads /* struct dload_payload */,
> +               const char *localpath)
> +{
> +       (void)handle;
> +       (void)payloads;
> +       (void)localpath;
> +       return 0;
> +}
> +
>  static char *filecache_find_url(alpm_handle_t *handle, const char *url)
>  {
>         const char *filebase = strrchr(url, '/');
> @@ -738,6 +748,7 @@ void _alpm_dload_payload_reset(struct dload_payload *payload)
>         FREE(payload->destfile_name);
>         FREE(payload->content_disp_name);
>         FREE(payload->fileurl);
> +       FREE(payload->filepath);
>         *payload = (struct dload_payload){0};
>  }
>
> @@ -746,6 +757,7 @@ void _alpm_dload_payload_reset_for_retry(struct dload_payload *payload)
>         ASSERT(payload, return);
>
>         FREE(payload->fileurl);
> +       FREE(payload->filepath);
>         payload->initial_size += payload->prevprogress;
>         payload->prevprogress = 0;
>         payload->unlink_on_fail = 0;
> diff --git a/lib/libalpm/dload.h b/lib/libalpm/dload.h
> index 1e8f75f3..3eb7fbe1 100644
> --- a/lib/libalpm/dload.h
> +++ b/lib/libalpm/dload.h
> @@ -31,6 +31,7 @@ struct dload_payload {
>         char *destfile_name;
>         char *content_disp_name;
>         char *fileurl;
> +       char *filepath; /* download URL path */
>         alpm_list_t *servers;
>         long respcode;
>         off_t initial_size;
> @@ -53,4 +54,8 @@ void _alpm_dload_payload_reset_for_retry(struct dload_payload *payload);
>  int _alpm_download(struct dload_payload *payload, const char *localpath,
>                 char **final_file, const char **final_url);
>
> +int _alpm_multi_download(alpm_handle_t *handle,
> +               alpm_list_t *payloads /* struct dload_payload */,
> +               const char *localpath);
> +
>  #endif /* ALPM_DLOAD_H */
> --
> 2.25.1
>
Allan McRae March 8, 2020, 5:23 a.m. UTC | #2
On 7/3/20 6:48 am, Anatol Pomozov wrote:
> Hi
> 
> On Fri, Mar 6, 2020 at 12:35 PM Anatol Pomozov <anatol.pomozov@gmail.com> wrote:
>>
>> This is an equivalent of alpm_db_update but for multiplexed (parallel)
>> download. The difference is that this function accepts list of
>> databases to update. And then ALPM internals download it in parallel if
>> possible.
>>
>> Add a stub for _alpm_multi_download the function that will do parallel
>> payloads downloads in the future.
>>
>> Introduce dload_payload->filepath field that contains url path to the
>> file we download. It is like fileurl field but does not contain
>> protocol/server part. The rationale for having this field is that with
>> the curl multidownload the server retry logic is going to move to a curl
>> callback. And the callback needs to be able to reconstruct the 'next'
>> fileurl. One will be able to do it by getting the next server url from
>> 'servers' list and then concat with filepath. Once the 'parallel download'
>> refactoring is over 'fileurl' field will go away.
>>
>> Signed-off-by: Anatol Pomozov <anatol.pomozov@gmail.com>
>> ---
>>  lib/libalpm/alpm.h    |   2 +
>>  lib/libalpm/be_sync.c | 132 ++++++++++++++++++++++++++++++++++++++++++
>>  lib/libalpm/dload.c   |  12 ++++
>>  lib/libalpm/dload.h   |   5 ++
>>  4 files changed, 151 insertions(+)
>>
>> diff --git a/lib/libalpm/alpm.h b/lib/libalpm/alpm.h
>> index 93b97f44..eb0490eb 100644
>> --- a/lib/libalpm/alpm.h
>> +++ b/lib/libalpm/alpm.h
>> @@ -1045,6 +1045,8 @@ int alpm_db_remove_server(alpm_db_t *db, const char *url);
>>   */
>>  int alpm_db_update(int force, alpm_db_t *db);
>>
>> +int alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, int failfast);
>> +
>>  /** Get a package entry from a package database.
>>   * @param db pointer to the package database to get the package from
>>   * @param name of the package
>> diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
>> index aafed15d..cdb46bd9 100644
>> --- a/lib/libalpm/be_sync.c
>> +++ b/lib/libalpm/be_sync.c
>> @@ -301,6 +301,138 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
>>         return ret;
>>  }
>>
>> +/** Update list of databases. This function may run updates in parallel.
>> + *
>> + * @param dbs a list of alpm_db_t to update.
>> + */
>> +int SYMEXPORT alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, UNUSED int failfast) {
> 
> One question I had initially is whether we need 'failfast' option for
> parallel downloads. Failfast means once any error is detected ALPM
> stops all the download streams right away and returns error back to
> the client.
> 
> But now I am not sure if this option will be really useful. Currently
> multi-download implementation waits for download transactions to
> finish and only then returns to the client. It works just fine.
> 
> So I am thinking to remove failfast option unless someone has a strong
> opinion we should keep it.
> 

We don't want failfast for databases, at least until databases are
downloaded into a temporary location.  Failfast would leave us with a
bunch of half-downloaded database files.

A
Allan McRae March 8, 2020, 1:04 p.m. UTC | #3
On 7/3/20 6:35 am, Anatol Pomozov wrote:
> This is an equivalent of alpm_db_update but for multiplexed (parallel)
> download. The difference is that this function accepts list of
> databases to update. And then ALPM internals download it in parallel if
> possible.
> 
> Add a stub for _alpm_multi_download the function that will do parallel
> payloads downloads in the future.

Thanks for splitting the patches up into smaller units.  It makes it a
lot easier for me to review.

Note that once review is complete, this patch (and the config one) will
sit on a branch until the rest of the work is ready to land in one go.

In fact... it probably makes sense to add _alpm_multi_download first,
then follow-up with this patch.

> Introduce dload_payload->filepath field that contains url path to the
> file we download. It is like fileurl field but does not contain
> protocol/server part. The rationale for having this field is that with
> the curl multidownload the server retry logic is going to move to a curl
> callback. And the callback needs to be able to reconstruct the 'next'
> fileurl. One will be able to do it by getting the next server url from
> 'servers' list and then concat with filepath. Once the 'parallel download'
> refactoring is over 'fileurl' field will go away.
> 
> Signed-off-by: Anatol Pomozov <anatol.pomozov@gmail.com>
> ---
>  lib/libalpm/alpm.h    |   2 +
>  lib/libalpm/be_sync.c | 132 ++++++++++++++++++++++++++++++++++++++++++
>  lib/libalpm/dload.c   |  12 ++++
>  lib/libalpm/dload.h   |   5 ++
>  4 files changed, 151 insertions(+)
> 
> diff --git a/lib/libalpm/alpm.h b/lib/libalpm/alpm.h
> index 93b97f44..eb0490eb 100644
> --- a/lib/libalpm/alpm.h
> +++ b/lib/libalpm/alpm.h
> @@ -1045,6 +1045,8 @@ int alpm_db_remove_server(alpm_db_t *db, const char *url);
>   */
>  int alpm_db_update(int force, alpm_db_t *db);
>  
> +int alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, int failfast);

As already mentioned in the other reply, I don't think failfast is needed.

It is a bit annoying that "force" and "dbs" are in a different order to
alpm_db_update, but I think this is the better order...  so keep it.

> +
>  /** Get a package entry from a package database.
>   * @param db pointer to the package database to get the package from
>   * @param name of the package
> diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
> index aafed15d..cdb46bd9 100644
> --- a/lib/libalpm/be_sync.c
> +++ b/lib/libalpm/be_sync.c
> @@ -301,6 +301,138 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
>  	return ret;
>  }
>  
> +/** Update list of databases. This function may run updates in parallel.
> + *
> + * @param dbs a list of alpm_db_t to update.
> + */

This should be in alpm.h, and all params and the return value need to be
documented.
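
For illustration, one possible shape for the documented declaration in alpm.h
(wording is hypothetical, and this assumes the failfast parameter is dropped
as discussed in the other reply):

/** Update a list of package databases.
 *
 * An equivalent of alpm_db_update() that accepts a list of databases and
 * may download them in parallel.
 *
 * @param handle the context handle
 * @param dbs list of alpm_db_t* databases to update
 * @param force if true, update even if the databases appear to be up to date
 * @return 0 on success, -1 on error (pm_errno is set accordingly)
 */
int alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force);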

> +int SYMEXPORT alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, UNUSED int failfast) {
> +	char *syncpath;
> +	const char *dbext = handle->dbext;
> +	alpm_list_t *i;
> +	int ret = -1;
> +	mode_t oldmask;
> +	alpm_list_t *payloads = NULL;
> +
> +	/* Sanity checks */
> +	ASSERT(dbs != NULL, return -1);
> +	handle->pm_errno = ALPM_ERR_OK;
> +
> +	syncpath = get_sync_dir(handle);
> +	ASSERT(syncpath != NULL, return -1);
> +
> +	/* make sure we have a sane umask */
> +	oldmask = umask(0022);
> +
> +	for(i = dbs; i; i = i->next) {
> +		alpm_db_t *db = i->data;
> +		int dbforce = force;
> +		struct dload_payload *payload = NULL;
> +		size_t len;
> +		int siglevel;
> +
> +		if(!(db->usage & ALPM_DB_USAGE_SYNC)) {
> +			continue;
> +		}
> +
> +		ASSERT(db != handle->db_local, RET_ERR(handle, ALPM_ERR_WRONG_ARGS, -1));
> +		ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
> +
> +		/* force update of invalid databases to fix potential mismatched database/signature */
> +		if(db->status & DB_STATUS_INVALID) {
> +			dbforce = 1;
> +		}
> +
> +		CALLOC(payload, 1, sizeof(*payload), RET_ERR(handle, ALPM_ERR_MEMORY, -1));
> +
> +		/* set hard upper limit of 128MiB */
> +		payload->max_size = 128 * 1024 * 1024;
> +		ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
> +		payload->servers = db->servers;
> +

I got to here.  Seems a lot of this is duplicated from the single db
path.  If both are going to coexist, can we do some refactoring?
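
As a sketch of what such a refactoring could look like (hypothetical helper,
not part of this patch), both paths could share a payload builder along
these lines:

/* Hypothetical: fill in the download payload fields for a sync database so
 * the single-db and multi-db code paths do not duplicate this setup. */
static int setup_db_payload(alpm_handle_t *handle, alpm_db_t *db,
		int force, struct dload_payload *payload)
{
	size_t len = strlen(db->treename) + strlen(handle->dbext) + 1;

	MALLOC(payload->filepath, len, RET_ERR(handle, ALPM_ERR_MEMORY, -1));
	snprintf(payload->filepath, len, "%s%s", db->treename, handle->dbext);

	payload->handle = handle;
	payload->force = force;
	payload->unlink_on_fail = 1;
	payload->max_size = 128 * 1024 * 1024; /* hard upper limit of 128MiB */
	payload->servers = db->servers;
	return 0;
}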

A
Anatol Pomozov March 8, 2020, 8:55 p.m. UTC | #4
Hi

The failfast option got removed.

On Sun, Mar 8, 2020 at 6:05 AM Allan McRae <allan@archlinux.org> wrote:
>
> On 7/3/20 6:35 am, Anatol Pomozov wrote:
> > This is an equivalent of alpm_db_update but for multiplexed (parallel)
> > download. The difference is that this function accepts list of
> > databases to update. And then ALPM internals download it in parallel if
> > possible.
> >
> > Add a stub for _alpm_multi_download the function that will do parallel
> > payloads downloads in the future.
>
> Thanks for splitting the patches up into smaller units.  It makes it a
> lot easier for me to review.

Yep. I'll try to split the work into smaller patches of around 100-200
lines each for easier and more modular review.

> Note that once review is complete, this patch (and the config one) will
> sit on a branch until the rest of the work is ready to land in one go.

It is fine. But please share this branch with me so I can rebase my
work on top of the reviewed/accepted changes. I do not want to keep
more than a couple of changes in my development branch. Once the review
of the previous changes is done, I'll start working on the new ones.

>
> In fact... it probably makes sense to add _alpm_multi_download first,
> then follow-up with this patch.

_alpm_multi_download is only one part of the story. It depends on
another large chunk of download.c changes (which is going to be in a
separate patch). It is easier for me to start with the alpm_dbs_update()
part.

>
> > Introduce dload_payload->filepath field that contains url path to the
> > file we download. It is like fileurl field but does not contain
> > protocol/server part. The rationale for having this field is that with
> > the curl multidownload the server retry logic is going to move to a curl
> > callback. And the callback needs to be able to reconstruct the 'next'
> > fileurl. One will be able to do it by getting the next server url from
> > 'servers' list and then concat with filepath. Once the 'parallel download'
> > refactoring is over 'fileurl' field will go away.
> >
> > Signed-off-by: Anatol Pomozov <anatol.pomozov@gmail.com>
> > ---
> >  lib/libalpm/alpm.h    |   2 +
> >  lib/libalpm/be_sync.c | 132 ++++++++++++++++++++++++++++++++++++++++++
> >  lib/libalpm/dload.c   |  12 ++++
> >  lib/libalpm/dload.h   |   5 ++
> >  4 files changed, 151 insertions(+)
> >
> > diff --git a/lib/libalpm/alpm.h b/lib/libalpm/alpm.h
> > index 93b97f44..eb0490eb 100644
> > --- a/lib/libalpm/alpm.h
> > +++ b/lib/libalpm/alpm.h
> > @@ -1045,6 +1045,8 @@ int alpm_db_remove_server(alpm_db_t *db, const char *url);
> >   */
> >  int alpm_db_update(int force, alpm_db_t *db);
> >
> > +int alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, int failfast);
>
> As already mentioned in the other reply, I don't think failfast is needed.
>
> It is a bit annoying that "force" and "dbs" are in a different order to
> alpm_db_update, but I think this is the better order...  so keep it.
>
> > +
> >  /** Get a package entry from a package database.
> >   * @param db pointer to the package database to get the package from
> >   * @param name of the package
> > diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
> > index aafed15d..cdb46bd9 100644
> > --- a/lib/libalpm/be_sync.c
> > +++ b/lib/libalpm/be_sync.c
> > @@ -301,6 +301,138 @@ int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
> >       return ret;
> >  }
> >
> > +/** Update list of databases. This function may run updates in parallel.
> > + *
> > + * @param dbs a list of alpm_db_t to update.
> > + */
>
> This should be in alpm.h and all params and return need documented.

The docs have been moved to alpm.h.

>
> > +int SYMEXPORT alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, UNUSED int failfast) {
> > +     char *syncpath;
> > +     const char *dbext = handle->dbext;
> > +     alpm_list_t *i;
> > +     int ret = -1;
> > +     mode_t oldmask;
> > +     alpm_list_t *payloads = NULL;
> > +
> > +     /* Sanity checks */
> > +     ASSERT(dbs != NULL, return -1);
> > +     handle->pm_errno = ALPM_ERR_OK;
> > +
> > +     syncpath = get_sync_dir(handle);
> > +     ASSERT(syncpath != NULL, return -1);
> > +
> > +     /* make sure we have a sane umask */
> > +     oldmask = umask(0022);
> > +
> > +     for(i = dbs; i; i = i->next) {
> > +             alpm_db_t *db = i->data;
> > +             int dbforce = force;
> > +             struct dload_payload *payload = NULL;
> > +             size_t len;
> > +             int siglevel;
> > +
> > +             if(!(db->usage & ALPM_DB_USAGE_SYNC)) {
> > +                     continue;
> > +             }
> > +
> > +             ASSERT(db != handle->db_local, RET_ERR(handle, ALPM_ERR_WRONG_ARGS, -1));
> > +             ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
> > +
> > +             /* force update of invalid databases to fix potential mismatched database/signature */
> > +             if(db->status & DB_STATUS_INVALID) {
> > +                     dbforce = 1;
> > +             }
> > +
> > +             CALLOC(payload, 1, sizeof(*payload), RET_ERR(handle, ALPM_ERR_MEMORY, -1));
> > +
> > +             /* set hard upper limit of 128MiB */
> > +             payload->max_size = 128 * 1024 * 1024;
> > +             ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
> > +             payload->servers = db->servers;
> > +
>
> I got to here.  Seems a lot of this is duplicated from the single db
> path.  If both are going to coexist, can we do some refactoring?

It depends on whether we want to keep the API backward-compatible. If it
is fine to break it in the pacman 6 release then we can just remove the
function from the ALPM API. Otherwise alpm_db_update() needs to be
reimplemented using the alpm_dbs_update() functionality.
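
For illustration, a minimal sketch (hypothetical, not part of this patch) of
alpm_db_update() reimplemented as a thin wrapper over alpm_dbs_update(),
passing failfast as 0 for now:

int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
{
	alpm_list_t *dbs = NULL;
	int ret;

	ASSERT(db != NULL && db->handle != NULL, return -1);

	dbs = alpm_list_add(dbs, db);
	ret = alpm_dbs_update(db->handle, dbs, force, 0);
	alpm_list_free(dbs);
	return ret;
}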
Eli Schwartz March 9, 2020, 12:12 a.m. UTC | #5
On 3/8/20 4:55 PM, Anatol Pomozov wrote:
>> Note that once review is complete, this patch (and the config one) will
>> sit on a branch until the rest of the work is ready to land in one go.
> 
> It is fine. But please share this branch with me so I can rebase my
> work on top of the reviewed/accepted changes. I do not want to keep
> more than a couple of changes in my development branch. Once review
> for previous changes is done I'll start working on the new changes.
In the future you should be able to find the branch in the same place
Allan publishes other WIP feature branches, which, for reference, is
here:

https://git.archlinux.org/users/allan/pacman.git/refs/
Allan McRae March 9, 2020, 12:14 a.m. UTC | #6
On 9/3/20 6:55 am, Anatol Pomozov wrote:
> On Sun, Mar 8, 2020 at 6:05 AM Allan McRae <allan@archlinux.org> wrote:

>> I got to here.  Seems a lot of this is duplicated from the single db
>> path.  If both are going to coexist, can we do some refactoring?
> 
> It depends whether we want to keep the API backward-compatible. If it
> is fine to break one in pacman 6 release then we can just remove the
> function from ALPM API. Otherwise alpm_db_update() need to be
> reimplemented using alpm_dbs_update() functionality.
> 

I was thinking that a non-pacman frontend may want to update a single
db.  But I suppose they just pass a single db to alpm_dbs_update().

So, I'm OK with the temporary code duplication followed by change of API
for 6.0.

A
Anatol Pomozov March 9, 2020, 7:35 p.m. UTC | #7
Hi

On Sun, Mar 8, 2020 at 5:15 PM Allan McRae <allan@archlinux.org> wrote:
>
> On 9/3/20 6:55 am, Anatol Pomozov wrote:
> > On Sun, Mar 8, 2020 at 6:05 AM Allan McRae <allan@archlinux.org> wrote:
>
> >> I got to here.  Seems a lot of this is duplicated from the single db
> >> path.  If both are going to coexist, can we do some refactoring?
> >
> > It depends whether we want to keep the API backward-compatible. If it
> > is fine to break one in pacman 6 release then we can just remove the
> > function from ALPM API. Otherwise alpm_db_update() need to be
> > reimplemented using alpm_dbs_update() functionality.
> >
>
> I was thinking that a non-pacman frontend may want to update a single
> db.  But I suppose they just pass a single db to alpm_dbs_update().

Yep. Passing a single-element list to alpm_dbs_update() is the exact
equivalent of the alpm_db_update() functionality.
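
From the frontend side the equivalent call would look roughly like this
(hypothetical snippet; handle, db and force are assumed to be in scope):

/* Hypothetical: update a single database through the list-based API. */
alpm_list_t *dbs = alpm_list_add(NULL, db);
if(alpm_dbs_update(handle, dbs, force, 0) != 0) {
	fprintf(stderr, "failed to update %s: %s\n",
			alpm_db_get_name(db), alpm_strerror(alpm_errno(handle)));
}
alpm_list_free(dbs);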

>
> So, I'm OK with the temporary code duplication followed by change of API
> for 6.0.

Ok. I will remove alpm_db_update() at the end of this patch series.
Allan McRae March 13, 2020, 12:06 a.m. UTC | #9
On 10/3/20 5:35 am, Anatol Pomozov wrote:
> Hi
> 
> On Sun, Mar 8, 2020 at 5:15 PM Allan McRae <allan@archlinux.org> wrote:
>>
>> On 9/3/20 6:55 am, Anatol Pomozov wrote:
>>> On Sun, Mar 8, 2020 at 6:05 AM Allan McRae <allan@archlinux.org> wrote:
>>
>>>> I got to here.  Seems a lot of this is duplicated from the single db
>>>> path.  If both are going to coexist, can we do some refactoring?
>>>
>>> It depends whether we want to keep the API backward-compatible. If it
>>> is fine to break one in pacman 6 release then we can just remove the
>>> function from ALPM API. Otherwise alpm_db_update() need to be
>>> reimplemented using alpm_dbs_update() functionality.
>>>
>>
>> I was thinking that a non-pacman frontend may want to update a single
>> db.  But I suppose they just pass a single db to alpm_dbs_update().
> 
> Yep. Passing a single element list to alpm_dbs_update() is exact equivalent of
> alpm_db_update() functionality.
> 
>>
>> So, I'm OK with the temporary code duplication followed by change of API
>> for 6.0.
> 
> Ok. I will remove alpm_db_update() at the end of this patch series.
> 

Well...  thinking about this some more, at the end of the patch the
current alpm_db_update() and alpm_dbs_update() should be renamed to take
its place.   All our API for db operations is of the form alpm_db_... so
we should keep it that way.

A
Allan McRae March 13, 2020, 12:29 a.m. UTC | #10
On 13/3/20 10:06 am, Allan McRae wrote:
> On 10/3/20 5:35 am, Anatol Pomozov wrote:
>> Hi
>>
>> On Sun, Mar 8, 2020 at 5:15 PM Allan McRae <allan@archlinux.org> wrote:
>>>
>>> On 9/3/20 6:55 am, Anatol Pomozov wrote:
>>>> On Sun, Mar 8, 2020 at 6:05 AM Allan McRae <allan@archlinux.org> wrote:
>>>
>>>>> I got to here.  Seems a lot of this is duplicated from the single db
>>>>> path.  If both are going to coexist, can we do some refactoring?
>>>>
>>>> It depends whether we want to keep the API backward-compatible. If it
>>>> is fine to break one in pacman 6 release then we can just remove the
>>>> function from ALPM API. Otherwise alpm_db_update() need to be
>>>> reimplemented using alpm_dbs_update() functionality.
>>>>
>>>
>>> I was thinking that a non-pacman frontend may want to update a single
>>> db.  But I suppose they just pass a single db to alpm_dbs_update().
>>
>> Yep. Passing a single element list to alpm_dbs_update() is exact equivalent of
>> alpm_db_update() functionality.
>>
>>>
>>> So, I'm OK with the temporary code duplication followed by change of API
>>> for 6.0.
>>
>> Ok. I will remove alpm_db_update() at the end of this patch series.
>>
> 
> Well...  thinking about this some more, at the end of the patch the
> current alpm_db_update() and alpm_dbs_update() should be renamed to take
> its place.   All our API for db operations is of the form alpm_db_... so
> we should keep it that way.
> 

Words are hard!

... at the end of the patchset the current alpm_db_update() should be
removed and alpm_dbs_update() should be renamed ...

Patch

diff --git a/lib/libalpm/alpm.h b/lib/libalpm/alpm.h
index 93b97f44..eb0490eb 100644
--- a/lib/libalpm/alpm.h
+++ b/lib/libalpm/alpm.h
@@ -1045,6 +1045,8 @@  int alpm_db_remove_server(alpm_db_t *db, const char *url);
  */
 int alpm_db_update(int force, alpm_db_t *db);
 
+int alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, int failfast);
+
 /** Get a package entry from a package database.
  * @param db pointer to the package database to get the package from
  * @param name of the package
diff --git a/lib/libalpm/be_sync.c b/lib/libalpm/be_sync.c
index aafed15d..cdb46bd9 100644
--- a/lib/libalpm/be_sync.c
+++ b/lib/libalpm/be_sync.c
@@ -301,6 +301,138 @@  int SYMEXPORT alpm_db_update(int force, alpm_db_t *db)
 	return ret;
 }
 
+/** Update list of databases. This function may run updates in parallel.
+ *
+ * @param dbs a list of alpm_db_t to update.
+ */
+int SYMEXPORT alpm_dbs_update(alpm_handle_t *handle, alpm_list_t *dbs, int force, UNUSED int failfast) {
+	char *syncpath;
+	const char *dbext = handle->dbext;
+	alpm_list_t *i;
+	int ret = -1;
+	mode_t oldmask;
+	alpm_list_t *payloads = NULL;
+
+	/* Sanity checks */
+	ASSERT(dbs != NULL, return -1);
+	handle->pm_errno = ALPM_ERR_OK;
+
+	syncpath = get_sync_dir(handle);
+	ASSERT(syncpath != NULL, return -1);
+
+	/* make sure we have a sane umask */
+	oldmask = umask(0022);
+
+	for(i = dbs; i; i = i->next) {
+		alpm_db_t *db = i->data;
+		int dbforce = force;
+		struct dload_payload *payload = NULL;
+		size_t len;
+		int siglevel;
+
+		if(!(db->usage & ALPM_DB_USAGE_SYNC)) {
+			continue;
+		}
+
+		ASSERT(db != handle->db_local, RET_ERR(handle, ALPM_ERR_WRONG_ARGS, -1));
+		ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
+
+		/* force update of invalid databases to fix potential mismatched database/signature */
+		if(db->status & DB_STATUS_INVALID) {
+			dbforce = 1;
+		}
+
+		CALLOC(payload, 1, sizeof(*payload), RET_ERR(handle, ALPM_ERR_MEMORY, -1));
+
+		/* set hard upper limit of 128MiB */
+		payload->max_size = 128 * 1024 * 1024;
+		ASSERT(db->servers != NULL, RET_ERR(handle, ALPM_ERR_SERVER_NONE, -1));
+		payload->servers = db->servers;
+
+		/* print server + filename into a buffer */
+		len = strlen(db->treename) + strlen(dbext) + 1;
+		MALLOC(payload->filepath, len, GOTO_ERR(handle, ALPM_ERR_MEMORY, cleanup));
+		snprintf(payload->filepath, len, "%s%s", db->treename, dbext);
+		payload->handle = handle;
+		payload->force = dbforce;
+		payload->unlink_on_fail = 1;
+
+		payloads = alpm_list_add(payloads, payload);
+
+		siglevel = alpm_db_get_siglevel(db);
+		if(siglevel & ALPM_SIG_DATABASE) {
+			struct dload_payload *sig_payload;
+			CALLOC(sig_payload, 1, sizeof(*sig_payload), GOTO_ERR(handle, ALPM_ERR_MEMORY, cleanup));
+
+			/* print filename into a buffer (leave space for separator and .sig) */
+			len = strlen(db->treename) + strlen(dbext) + 5;
+			MALLOC(sig_payload->filepath, len, goto cleanup);
+			snprintf(sig_payload->filepath, len, "%s%s.sig", db->treename, dbext);
+
+			sig_payload->handle = handle;
+			sig_payload->force = dbforce;
+			sig_payload->errors_ok = (siglevel & ALPM_SIG_DATABASE_OPTIONAL);
+
+			/* set hard upper limit of 16KiB */
+			sig_payload->max_size = 16 * 1024;
+			sig_payload->servers = db->servers;
+
+			payloads = alpm_list_add(payloads, sig_payload);
+		}
+	}
+
+	/* attempt to grab a lock */
+	if(_alpm_handle_lock(handle)) {
+		GOTO_ERR(handle, ALPM_ERR_HANDLE_LOCK, cleanup);
+	}
+
+	ret = _alpm_multi_download(handle, payloads, syncpath);
+	if(ret < 0) {
+		goto cleanup;
+	}
+
+	for(i = dbs; i; i = i->next) {
+		alpm_db_t *db = i->data;
+		if(!(db->usage & ALPM_DB_USAGE_SYNC)) {
+			continue;
+		}
+
+		/* Cache needs to be rebuilt */
+		_alpm_db_free_pkgcache(db);
+
+		/* clear all status flags regarding validity/existence */
+		db->status &= ~DB_STATUS_VALID;
+		db->status &= ~DB_STATUS_INVALID;
+		db->status &= ~DB_STATUS_EXISTS;
+		db->status &= ~DB_STATUS_MISSING;
+
+		/* if the download failed skip validation to preserve the download error */
+		if(sync_db_validate(db) != 0) {
+			/* pm_errno should be set */
+			ret = -1;
+		}
+	}
+
+cleanup:
+	_alpm_handle_unlock(handle);
+
+	if(ret == -1) {
+		/* pm_errno was set by the download code */
+		_alpm_log(handle, ALPM_LOG_DEBUG, "failed to sync dbs: %s\n",
+				alpm_strerror(handle->pm_errno));
+	} else {
+		handle->pm_errno = ALPM_ERR_OK;
+	}
+
+	if(payloads) {
+		alpm_list_free_inner(payloads, (alpm_list_fn_free)_alpm_dload_payload_reset);
+		FREELIST(payloads);
+	}
+	free(syncpath);
+	umask(oldmask);
+	return ret;
+}
+
 /* Forward decl so I don't reorganize the whole file right now */
 static int sync_db_read(alpm_db_t *db, struct archive *archive,
 		struct archive_entry *entry, alpm_pkg_t **likely_pkg);
diff --git a/lib/libalpm/dload.c b/lib/libalpm/dload.c
index 670da03d..7cd3e3a4 100644
--- a/lib/libalpm/dload.c
+++ b/lib/libalpm/dload.c
@@ -636,6 +636,16 @@  int _alpm_download(struct dload_payload *payload, const char *localpath,
 	}
 }
 
+int _alpm_multi_download(alpm_handle_t *handle,
+		alpm_list_t *payloads /* struct dload_payload */,
+		const char *localpath)
+{
+	(void)handle;
+	(void)payloads;
+	(void)localpath;
+	return 0;
+}
+
 static char *filecache_find_url(alpm_handle_t *handle, const char *url)
 {
 	const char *filebase = strrchr(url, '/');
@@ -738,6 +748,7 @@  void _alpm_dload_payload_reset(struct dload_payload *payload)
 	FREE(payload->destfile_name);
 	FREE(payload->content_disp_name);
 	FREE(payload->fileurl);
+	FREE(payload->filepath);
 	*payload = (struct dload_payload){0};
 }
 
@@ -746,6 +757,7 @@  void _alpm_dload_payload_reset_for_retry(struct dload_payload *payload)
 	ASSERT(payload, return);
 
 	FREE(payload->fileurl);
+	FREE(payload->filepath);
 	payload->initial_size += payload->prevprogress;
 	payload->prevprogress = 0;
 	payload->unlink_on_fail = 0;
diff --git a/lib/libalpm/dload.h b/lib/libalpm/dload.h
index 1e8f75f3..3eb7fbe1 100644
--- a/lib/libalpm/dload.h
+++ b/lib/libalpm/dload.h
@@ -31,6 +31,7 @@  struct dload_payload {
 	char *destfile_name;
 	char *content_disp_name;
 	char *fileurl;
+	char *filepath; /* download URL path */
 	alpm_list_t *servers;
 	long respcode;
 	off_t initial_size;
@@ -53,4 +54,8 @@  void _alpm_dload_payload_reset_for_retry(struct dload_payload *payload);
 int _alpm_download(struct dload_payload *payload, const char *localpath,
 		char **final_file, const char **final_url);
 
+int _alpm_multi_download(alpm_handle_t *handle,
+		alpm_list_t *payloads /* struct dload_payload */,
+		const char *localpath);
+
 #endif /* ALPM_DLOAD_H */