From ac34e358103ee718369692b9ba5afe6830a1df92 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 30 Nov 2011 12:25:54 -0600 Subject: reporead: fix filesonly needs update checks This was broken after the select for update changes. We really should split the whole filesonly update into another method instead of the current shotgun approach with conditionals everywhere. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index cf101d97..f8cc2034 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -382,11 +382,13 @@ def db_update(archname, reponame, pkgs, options): # files/depends/all related items to be double-imported. if filesonly: with transaction.commit_on_success(): + if not dbpkg.files_last_update or not dbpkg.last_update: + pass + elif dbpkg.files_last_update > dbpkg.last_update: + logger.debug("Files for %s are up to date", pkg.name) + continue # TODO Django 1.4 select_for_update() will work once released dbpkg = select_pkg_for_update(dbpkg) - if pkg_same_version(pkg, dbpkg): - logger.debug("Package %s was already updated", pkg.name) - continue logger.debug("Checking files for package %s", pkg.name) populate_files(dbpkg, pkg, force=force) else: -- cgit v1.2.3-2-g168b From dca9fd2ea687b8d20fd5c39c1449846ddef1e3c2 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 30 Nov 2011 13:04:15 -0600 Subject: reporead: split out filesonly update method This removes a bunch of the conditional logic at a slight cost of some code duplication. However, the methods and madness is now much easier to follow. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 170 +++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 75 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index f8cc2034..e4ba8580 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -282,40 +282,20 @@ def select_pkg_for_update(dbpkg): return list(new_pkg)[0] -def db_update(archname, reponame, pkgs, options): - """ - Parses a list and updates the Arch dev database accordingly. - - Arguments: - pkgs -- A list of Pkg objects. - - """ - logger.info('Updating Arch: %s', archname) - force = options.get('force', False) - filesonly = options.get('filesonly', False) - +def update_common(archname, reponame, pkgs, sanity_check=True): with transaction.commit_manually(): repository = Repo.objects.get(name__iexact=reponame) architecture = Arch.objects.get(name__iexact=archname) # no-arg order_by() removes even the default ordering; we don't need it dbpkgs = Package.objects.filter( arch=architecture, repo=repository).order_by() - # This makes our inner loop where we find packages by name *way* more - # efficient by not having to go to the database for each package to - # SELECT them by name. - dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) - - logger.debug("Creating sets") - dbset = set(dbdict.keys()) - syncset = set([pkg.name for pkg in pkgs]) - logger.info("%d packages in current web DB", len(dbset)) - logger.info("%d packages in new updating db", len(syncset)) - in_sync_not_db = syncset - dbset - logger.info("%d packages in sync not db", len(in_sync_not_db)) + + logger.info("%d packages in current web DB", len(dbpkgs)) + logger.info("%d packages in new updating DB", len(pkgs)) # Try to catch those random package deletions that make Eric so unhappy. - if len(dbset): - dbpercent = 100.0 * len(syncset) / len(dbset) + if len(dbpkgs): + dbpercent = 100.0 * len(pkgs) / len(dbpkgs) else: dbpercent = 0.0 logger.info("DB package ratio: %.1f%%", dbpercent) @@ -324,11 +304,13 @@ def db_update(archname, reponame, pkgs, options): # means we expect the repo to fluctuate a lot. msg = "Package database has %.1f%% the number of packages in the " \ "web database" % dbpercent - if len(dbset) == 0 and len(syncset) == 0: + if not sanity_check: + pass + elif repository.testing or repository.staging: pass - elif not filesonly and \ - len(dbset) > 20 and dbpercent < 50.0 and \ - not repository.testing and not repository.staging: + elif len(dbpkgs) == 0 and len(pkgs) == 0: + pass + elif len(dbpkgs) > 20 and dbpercent < 50.0: logger.error(msg) raise Exception(msg) elif dbpercent < 75.0: @@ -339,27 +321,45 @@ def db_update(archname, reponame, pkgs, options): # to guard against simultaneous updates transaction.commit() - if not filesonly: - # packages in syncdb and not in database (add to database) - for pkg in (pkg for pkg in pkgs if pkg.name in in_sync_not_db): - logger.info("Adding package %s", pkg.name) - dbpkg = Package(pkgname=pkg.name, arch=architecture, repo=repository) - try: - with transaction.commit_on_success(): - populate_pkg(dbpkg, pkg, timestamp=datetime.utcnow()) - except IntegrityError: - logger.warning("Could not add package %s; " - "not fatal if another thread beat us to it.", - pkg.name, exc_info=True) - - # packages in database and not in syncdb (remove from database) - for pkgname in (dbset - syncset): - logger.info("Removing package %s", pkgname) - dbpkg = dbdict[pkgname] + return dbpkgs + +def db_update(archname, reponame, pkgs, force=False): + """ + Parses a list of packages and updates the packages database accordingly. + """ + logger.info('Updating %s (%s)', reponame, archname) + dbpkgs = update_common(archname, reponame, pkgs, True) + + # This makes our inner loop where we find packages by name *way* more + # efficient by not having to go to the database for each package to + # SELECT them by name. + dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) + + dbset = set(dbdict.keys()) + syncset = set([pkg.name for pkg in pkgs]) + + in_sync_not_db = syncset - dbset + logger.info("%d packages in sync not db", len(in_sync_not_db)) + # packages in syncdb and not in database (add to database) + for pkg in (pkg for pkg in pkgs if pkg.name in in_sync_not_db): + logger.info("Adding package %s", pkg.name) + dbpkg = Package(pkgname=pkg.name, arch=architecture, repo=repository) + try: with transaction.commit_on_success(): - # no race condition here as long as simultaneous threads both - # issue deletes; second delete will be a no-op - dbpkg.delete() + populate_pkg(dbpkg, pkg, timestamp=datetime.utcnow()) + except IntegrityError: + logger.warning("Could not add package %s; " + "not fatal if another thread beat us to it.", + pkg.name, exc_info=True) + + # packages in database and not in syncdb (remove from database) + for pkgname in (dbset - syncset): + logger.info("Removing package %s", pkgname) + dbpkg = dbdict[pkgname] + with transaction.commit_on_success(): + # no race condition here as long as simultaneous threads both + # issue deletes; second delete will be a no-op + dbpkg.delete() # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset @@ -369,9 +369,7 @@ def db_update(archname, reponame, pkgs, options): timestamp = None # for a force, we don't want to update the timestamp. # for a non-force, we don't want to do anything at all. - if filesonly: - pass - elif pkg_same_version(pkg, dbpkg): + if pkg_same_version(pkg, dbpkg): if not force: continue else: @@ -380,28 +378,45 @@ def db_update(archname, reponame, pkgs, options): # The odd select_for_update song and dance here are to ensure # simultaneous updates don't happen on a package, causing # files/depends/all related items to be double-imported. - if filesonly: - with transaction.commit_on_success(): - if not dbpkg.files_last_update or not dbpkg.last_update: - pass - elif dbpkg.files_last_update > dbpkg.last_update: - logger.debug("Files for %s are up to date", pkg.name) - continue - # TODO Django 1.4 select_for_update() will work once released - dbpkg = select_pkg_for_update(dbpkg) - logger.debug("Checking files for package %s", pkg.name) - populate_files(dbpkg, pkg, force=force) - else: - with transaction.commit_on_success(): - # TODO Django 1.4 select_for_update() will work once released - dbpkg = select_pkg_for_update(dbpkg) - if pkg_same_version(pkg, dbpkg): - logger.debug("Package %s was already updated", pkg.name) - continue - logger.info("Updating package %s", pkg.name) - populate_pkg(dbpkg, pkg, force=force, timestamp=timestamp) + with transaction.commit_on_success(): + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + if pkg_same_version(pkg, dbpkg): + logger.debug("Package %s was already updated", pkg.name) + continue + logger.info("Updating package %s", pkg.name) + populate_pkg(dbpkg, pkg, force=force, timestamp=timestamp) + + logger.info('Finished updating arch: %s', archname) + + +def filesonly_update(archname, reponame, pkgs, force=False): + """ + Parses a list of packages and updates the packages database accordingly. + """ + logger.info('Updating files for %s (%s)', reponame, archname) + dbpkgs = update_common(archname, reponame, pkgs, False) + dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) + dbset = set(dbdict.keys()) + + for pkg in (pkg for pkg in pkgs if pkg.name in dbset): + dbpkg = dbdict[pkg.name] + + # The odd select_for_update song and dance here are to ensure + # simultaneous updates don't happen on a package, causing + # files to be double-imported. + with transaction.commit_on_success(): + if not dbpkg.files_last_update or not dbpkg.last_update: + pass + elif dbpkg.files_last_update > dbpkg.last_update: + logger.debug("Files for %s are up to date", pkg.name) + continue + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + logger.debug("Checking files for package %s", pkg.name) + populate_files(dbpkg, pkg, force=force) - logger.info('Finished updating Arch: %s', archname) + logger.info('Finished updating arch: %s', archname) def parse_info(iofile): @@ -490,6 +505,8 @@ def read_repo(primary_arch, repo_file, options): """ # always returns an Arch object, regardless of what is passed in primary_arch = locate_arch(primary_arch) + force = options.get('force', False) + filesonly = options.get('filesonly', False) repo, packages = parse_repo(repo_file) @@ -509,7 +526,10 @@ def read_repo(primary_arch, repo_file, options): logger.info('Starting database updates for %s.', repo_file) for arch in sorted(packages_arches.keys()): - db_update(arch, repo, packages_arches[arch], options) + if filesonly: + filesonly_update(arch, repo, packages_arches[arch], force) + else: + db_update(arch, repo, packages_arches[arch], force) logger.info('Finished database updates for %s.', repo_file) return 0 -- cgit v1.2.3-2-g168b From 4d02cd5b5d4437dd1543e2d45044db72da1989f4 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 30 Nov 2011 23:56:12 -0600 Subject: reporead: fix not defined variable Way to fail at refactoring, Dan. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index e4ba8580..c444538b 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -329,6 +329,8 @@ def db_update(archname, reponame, pkgs, force=False): """ logger.info('Updating %s (%s)', reponame, archname) dbpkgs = update_common(archname, reponame, pkgs, True) + repository = Repo.objects.get(name__iexact=reponame) + architecture = Arch.objects.get(name__iexact=archname) # This makes our inner loop where we find packages by name *way* more # efficient by not having to go to the database for each package to -- cgit v1.2.3-2-g168b