From b6f86d9ab0d6910f0f70398b07e965d337bd9e78 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 16 Aug 2011 16:04:16 -0500 Subject: Add two new DB fields to reporead Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 470b785d..97fdbb73 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -73,7 +73,7 @@ class Command(BaseCommand): class Pkg(object): """An interim 'container' object for holding Arch package data.""" bare = ( 'name', 'base', 'arch', 'desc', 'filename', - 'md5sum', 'url', 'packager' ) + 'md5sum', 'sha256sum', 'pgpsig', 'url', 'packager' ) number = ( 'csize', 'isize' ) collections = ( 'depends', 'optdepends', 'conflicts', 'provides', 'replaces', 'groups', 'license', 'files' ) -- cgit v1.2.3-2-g168b From e5d09fb7e9003b7f96685af9c0a722b45746448e Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 17 Aug 2011 16:18:12 -0500 Subject: Add PGP signature package field And add eventual display code for it to the details template, but don't show it yet as no packages will have it. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 1 + 1 file changed, 1 insertion(+) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 97fdbb73..cf597577 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -203,6 +203,7 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): dbpkg.packager_str = repopkg.packager # attempt to find the corresponding django user for this string dbpkg.packager = finder.find(repopkg.packager) + dbpkg.pgp_signature = repopkg.pgpsig if timestamp: dbpkg.flag_date = None -- cgit v1.2.3-2-g168b From 1b83844b30d3271b8fb50757d827c7b8fe8b5585 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 26 Oct 2011 03:25:15 -0500 Subject: Ensure PGP signature values are not trimmed This makes them totally unusable for any real purpose down the road. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index cf597577..a8e3219e 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -73,7 +73,7 @@ class Command(BaseCommand): class Pkg(object): """An interim 'container' object for holding Arch package data.""" bare = ( 'name', 'base', 'arch', 'desc', 'filename', - 'md5sum', 'sha256sum', 'pgpsig', 'url', 'packager' ) + 'md5sum', 'sha256sum', 'url', 'packager' ) number = ( 'csize', 'isize' ) collections = ( 'depends', 'optdepends', 'conflicts', 'provides', 'replaces', 'groups', 'license', 'files' ) @@ -85,6 +85,7 @@ class Pkg(object): self.ver = None self.rel = None self.epoch = 0 + self.pgpsig = None for k in self.bare + self.number: setattr(self, k, None) for k in self.collections: @@ -99,6 +100,9 @@ class Pkg(object): setattr(self, k, v[0][:254]) elif k in self.number: setattr(self, k, long(v[0])) + elif k == 'pgpsig': + # do NOT prune this value at all + setattr(self, k, v[0]) elif k == 'version': match = self.version_re.match(v[0]) self.ver = match.group(3) -- cgit v1.2.3-2-g168b From 9550236a87fc65827e994bea108350a43d3f161f Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 15 Nov 2011 21:26:57 -0600 Subject: Improve primary arch validation Ensure we can accept either a Arch object or an architecture name when passed to read_repo() by moving the validation there and being a bit more careful about typechecking and object lookup. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index a8e3219e..b4966834 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -51,8 +51,6 @@ class Command(BaseCommand): def handle(self, arch=None, filename=None, **options): if not arch: raise CommandError('Architecture is required.') - if not validate_arch(arch): - raise CommandError('Specified architecture %s is not currently known.' % arch) if not filename: raise CommandError('Package database file is required.') filename = os.path.normpath(filename) @@ -464,29 +462,38 @@ def parse_repo(repopath): logger.info("Finished repo parsing, %d total packages", len(pkgs)) return (reponame, pkgs.values()) -def validate_arch(archname): +def locate_arch(arch): "Check if arch is valid." - return Arch.objects.filter(name__iexact=archname).exists() + if isinstance(arch, Arch): + return arch + try: + return Arch.objects.get(name__iexact=arch) + except Arch.DoesNotExist: + raise CommandError( + 'Specified architecture %s is not currently known.' % arch) + def read_repo(primary_arch, repo_file, options): """ Parses repo.db.tar.gz file and returns exit status. """ + # always returns an Arch object, regardless of what is passed in + primary_arch = locate_arch(primary_arch) + repo, packages = parse_repo(repo_file) # group packages by arch -- to handle noarch stuff packages_arches = {} for arch in Arch.objects.filter(agnostic=True): packages_arches[arch.name] = [] - packages_arches[primary_arch] = [] + packages_arches[primary_arch.name] = [] for package in packages: if package.arch in packages_arches: packages_arches[package.arch].append(package) else: # we don't include mis-arched packages - logger.warning("Package %s arch = %s", - package.name, package.arch) + logger.warning("Package %s arch = %s", package.name, package.arch) del packages logger.info('Starting database updates.') -- cgit v1.2.3-2-g168b From c00e7e84045613ee2aa80f66b9972db971ab3f26 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 15 Nov 2011 23:43:42 -0600 Subject: reporead: clean up some debug logging Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index b4966834..45229524 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -36,6 +36,8 @@ logging.basicConfig( format='%(asctime)s -> %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', stream=sys.stderr) +TRACE = 5 +logging.addLevelName(TRACE, 'TRACE') logger = logging.getLogger() class Command(BaseCommand): @@ -368,7 +370,7 @@ def db_update(archname, reponame, pkgs, options): # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset for p in [x for x in pkgs if x.name in pkg_in_both]: - logger.debug("Looking for package updates") + logger.debug("Checking package %s", p.name) dbp = dbdict[p.name] timestamp = None # for a force, we don't want to update the timestamp. @@ -406,7 +408,7 @@ def parse_info(iofile): continue elif line.startswith('%') and line.endswith('%'): blockname = line[1:-1].lower() - logger.debug("Parsing package block %s", blockname) + logger.log(TRACE, "Parsing package block %s", blockname) store[blockname] = [] elif blockname: store[blockname].append(line) @@ -456,7 +458,7 @@ def parse_repo(repopath): tarinfo.name) data_file.close() - logger.debug("Done parsing file %s", fname) + logger.debug("Done parsing file %s/%s", pkgid, fname) repodb.close() logger.info("Finished repo parsing, %d total packages", len(pkgs)) -- cgit v1.2.3-2-g168b From 404c4b400b2bd2a14e0363e33d66505c51903fe7 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 16 Nov 2011 12:48:36 -0600 Subject: reporead: a few small tweaks Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 45229524..ad76db4d 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -450,13 +450,14 @@ def parse_repo(repopath): continue data_file = repodb.extractfile(tarinfo) data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), - encoding='utf=8') + encoding='UTF-8') try: pkgs[pkgid].populate(parse_info(data_file)) except UnicodeDecodeError: logger.warn("Could not correctly decode %s, skipping file", tarinfo.name) data_file.close() + del data_file logger.debug("Done parsing file %s/%s", pkgid, fname) @@ -498,10 +499,10 @@ def read_repo(primary_arch, repo_file, options): logger.warning("Package %s arch = %s", package.name, package.arch) del packages - logger.info('Starting database updates.') + logger.info('Starting database updates for %s.', repo_file) for arch in sorted(packages_arches.keys()): db_update(arch, repo, packages_arches[arch], options) - logger.info('Finished database updates.') + logger.info('Finished database updates for %s.', repo_file) return 0 # vim: set ts=4 sw=4 et: -- cgit v1.2.3-2-g168b From a9819e3d715ce3e5c20c9665db9a6100f06ab562 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 17 Nov 2011 12:34:12 -0600 Subject: Ensure reporead is protected against simultaneous runs This adds a bunch of transaction magic and SELECT FOR UPDATE stuff to reporead to cope with the now-concurrent runs of reporead we get when invoked from our inotify-based updater. The collision occurs with 'any' architecture packages as both repo databases contain the new version, and the updates occur at exactly the same time. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 206 +++++++++++++++++----------------- 1 file changed, 106 insertions(+), 100 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index ad76db4d..b6bd8457 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -13,10 +13,6 @@ Example: ./manage.py reporead i686 /tmp/core.db.tar.gz """ -from django.core.management.base import BaseCommand, CommandError -from django.contrib.auth.models import User -from django.db import transaction - from collections import defaultdict import io import os @@ -27,6 +23,11 @@ import logging from datetime import datetime from optparse import make_option +from django.core.management.base import BaseCommand, CommandError +from django.contrib.auth.models import User +from django.db import connections, router, transaction +from django.db.utils import IntegrityError + from devel.utils import UserFinder from main.models import Arch, Package, PackageDepend, PackageFile, Repo from packages.models import Conflict, Provision, Replacement @@ -189,8 +190,6 @@ def create_multivalued(dbpkg, repopkg, db_attr, repo_attr): finder = UserFinder() def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): - db_score = 1 - if repopkg.base: dbpkg.pkgbase = repopkg.base else: @@ -214,7 +213,7 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): dbpkg.last_update = timestamp dbpkg.save() - db_score += populate_files(dbpkg, repopkg, force=force) + populate_files(dbpkg, repopkg, force=force) dbpkg.packagedepend_set.all().delete() for y in repopkg.depends: @@ -235,28 +234,23 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): create_multivalued(dbpkg, repopkg, 'groups', 'groups') create_multivalued(dbpkg, repopkg, 'licenses', 'license') - related_score = (len(repopkg.depends) + len(repopkg.optdepends) - + len(repopkg.conflicts) + len(repopkg.provides) - + len(repopkg.replaces) + len(repopkg.groups) - + len(repopkg.license)) - if related_score: - db_score += (related_score / 20) + 1 - return db_score +pkg_same_version = lambda pkg, dbpkg: pkg.ver == dbpkg.pkgver \ + and pkg.rel == dbpkg.pkgrel and pkg.epoch == dbpkg.epoch def populate_files(dbpkg, repopkg, force=False): if not force: - if dbpkg.pkgver != repopkg.ver or dbpkg.pkgrel != repopkg.rel \ - or dbpkg.epoch != repopkg.epoch: + if not pkg_same_version(repopkg, dbpkg): logger.info("DB version (%s) didn't match repo version " "(%s) for package %s, skipping file list addition", dbpkg.full_version, repopkg.full_version, dbpkg.pkgname) - return 0 + return if not dbpkg.files_last_update or not dbpkg.last_update: pass elif dbpkg.files_last_update > dbpkg.last_update: - return 0 + return + # only delete files if we are reading a DB that contains them if repopkg.has_files: dbpkg.packagefile_set.all().delete() @@ -275,30 +269,19 @@ def populate_files(dbpkg, repopkg, force=False): pkgfile.save(force_insert=True) dbpkg.files_last_update = datetime.utcnow() dbpkg.save() - return (len(repopkg.files) / 50) + 1 - return 0 - -class Batcher(object): - def __init__(self, threshold, start=0): - self.threshold = threshold - self.meter = start - def batch_commit(self, score): - """ - Track updates to the database and perform a commit if the batch - becomes sufficiently large. "Large" is defined by waiting for the - sum of scores to exceed the arbitrary threshold value; once it is - hit a commit is issued. - """ - self.meter += score - if self.meter > self.threshold: - logger.debug("Committing transaction, batch threshold hit") - transaction.commit() - self.meter = 0 +def select_pkg_for_update(dbpkg): + database = router.db_for_write(Package, instance=dbpkg) + connection = connections[database] + if 'sqlite' in connection.settings_dict['ENGINE'].lower(): + return dbpkg + new_pkg = Package.objects.raw( + 'SELECT * FROM packages WHERE id = %s FOR UPDATE', + [dbpkg.id]) + return list(new_pkg)[0] -@transaction.commit_on_success def db_update(archname, reponame, pkgs, options): """ Parses a list and updates the Arch dev database accordingly. @@ -310,88 +293,111 @@ def db_update(archname, reponame, pkgs, options): logger.info('Updating Arch: %s', archname) force = options.get('force', False) filesonly = options.get('filesonly', False) - repository = Repo.objects.get(name__iexact=reponame) - architecture = Arch.objects.get(name__iexact=archname) - # no-arg order_by() removes even the default ordering; we don't need it - dbpkgs = Package.objects.filter( - arch=architecture, repo=repository).order_by() - # This makes our inner loop where we find packages by name *way* more - # efficient by not having to go to the database for each package to - # SELECT them by name. - dbdict = dict([(pkg.pkgname, pkg) for pkg in dbpkgs]) - - logger.debug("Creating sets") - dbset = set(dbdict.keys()) - syncset = set([pkg.name for pkg in pkgs]) - logger.info("%d packages in current web DB", len(dbset)) - logger.info("%d packages in new updating db", len(syncset)) - in_sync_not_db = syncset - dbset - logger.info("%d packages in sync not db", len(in_sync_not_db)) - - # Try to catch those random package deletions that make Eric so unhappy. - if len(dbset): - dbpercent = 100.0 * len(syncset) / len(dbset) - else: - dbpercent = 0.0 - logger.info("DB package ratio: %.1f%%", dbpercent) - - # Fewer than 20 packages makes the percentage check unreliable, but it also - # means we expect the repo to fluctuate a lot. - msg = "Package database has %.1f%% the number of packages in the " \ - "web database" % dbpercent - if len(dbset) == 0 and len(syncset) == 0: - pass - elif not filesonly and \ - len(dbset) > 20 and dbpercent < 50.0 and \ - not repository.testing and not repository.staging: - logger.error(msg) - raise Exception(msg) - elif dbpercent < 75.0: - logger.warning(msg) - - batcher = Batcher(100) + + with transaction.commit_manually(): + repository = Repo.objects.get(name__iexact=reponame) + architecture = Arch.objects.get(name__iexact=archname) + # no-arg order_by() removes even the default ordering; we don't need it + dbpkgs = Package.objects.filter( + arch=architecture, repo=repository).order_by() + # This makes our inner loop where we find packages by name *way* more + # efficient by not having to go to the database for each package to + # SELECT them by name. + dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) + + logger.debug("Creating sets") + dbset = set(dbdict.keys()) + syncset = set([pkg.name for pkg in pkgs]) + logger.info("%d packages in current web DB", len(dbset)) + logger.info("%d packages in new updating db", len(syncset)) + in_sync_not_db = syncset - dbset + logger.info("%d packages in sync not db", len(in_sync_not_db)) + + # Try to catch those random package deletions that make Eric so unhappy. + if len(dbset): + dbpercent = 100.0 * len(syncset) / len(dbset) + else: + dbpercent = 0.0 + logger.info("DB package ratio: %.1f%%", dbpercent) + + # Fewer than 20 packages makes the percentage check unreliable, but it also + # means we expect the repo to fluctuate a lot. + msg = "Package database has %.1f%% the number of packages in the " \ + "web database" % dbpercent + if len(dbset) == 0 and len(syncset) == 0: + pass + elif not filesonly and \ + len(dbset) > 20 and dbpercent < 50.0 and \ + not repository.testing and not repository.staging: + logger.error(msg) + raise Exception(msg) + elif dbpercent < 75.0: + logger.warning(msg) + + # If isolation level is repeatable-read, we need to ensure each package + # update starts a new transaction and re-queries the database as necessary + # to guard against simultaneous updates + transaction.commit() if not filesonly: # packages in syncdb and not in database (add to database) - for p in [x for x in pkgs if x.name in in_sync_not_db]: - logger.info("Adding package %s", p.name) - pkg = Package(pkgname=p.name, arch=architecture, repo=repository) - score = populate_pkg(pkg, p, timestamp=datetime.utcnow()) - batcher.batch_commit(score) + for pkg in (pkg for pkg in pkgs if pkg.name in in_sync_not_db): + logger.info("Adding package %s", pkg.name) + dbpkg = Package(pkgname=pkg.name, arch=architecture, repo=repository) + try: + with transaction.commit_on_success(): + populate_pkg(dbpkg, pkg, timestamp=datetime.utcnow()) + except IntegrityError: + logger.warning("Could not add package %s; " + "not fatal if another thread beat us to it.", + pkg.name, exc_info=True) # packages in database and not in syncdb (remove from database) - in_db_not_sync = dbset - syncset - for p in in_db_not_sync: - logger.info("Removing package %s", p) - dbp = dbdict[p] - dbp.delete() - batcher.batch_commit(1) + for pkgname in (dbset - syncset): + logger.info("Removing package %s", pkgname) + dbpkg = dbdict[pkgname] + with transaction.commit_on_success(): + # no race condition here as long as simultaneous threads both + # issue deletes; second delete will be a no-op + dbpkg.delete() # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset - for p in [x for x in pkgs if x.name in pkg_in_both]: - logger.debug("Checking package %s", p.name) - dbp = dbdict[p.name] + for pkg in (x for x in pkgs if x.name in pkg_in_both): + logger.debug("Checking package %s", pkg.name) + dbpkg = dbdict[pkg.name] timestamp = None # for a force, we don't want to update the timestamp. # for a non-force, we don't want to do anything at all. if filesonly: pass - elif p.ver == dbp.pkgver and p.rel == dbp.pkgrel \ - and p.epoch == dbp.epoch: + elif pkg_same_version(pkg, dbpkg): if not force: continue else: timestamp = datetime.utcnow() + # The odd select_for_update song and dance here are to ensure + # simultaneous updates don't happen on a package, causing + # files/depends/all related items to be double-imported. if filesonly: - logger.debug("Checking files for package %s", p.name) - score = populate_files(dbp, p, force=force) + with transaction.commit_on_success(): + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + if pkg_same_version(pkg, dbpkg): + logger.debug("Package %s was already updated", pkg.name) + continue + logger.debug("Checking files for package %s", pkg.name) + populate_files(dbpkg, pkg, force=force) else: - logger.info("Updating package %s", p.name) - score = populate_pkg(dbp, p, force=force, timestamp=timestamp) - - batcher.batch_commit(score) + with transaction.commit_on_success(): + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + if pkg_same_version(pkg, dbpkg): + logger.debug("Package %s was already updated", pkg.name) + continue + logger.info("Updating package %s", pkg.name) + populate_pkg(dbpkg, pkg, force=force, timestamp=timestamp) logger.info('Finished updating Arch: %s', archname) -- cgit v1.2.3-2-g168b From 2cb4f97bb235217d6e56deded1444f5e84f08b71 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 17 Nov 2011 13:36:27 -0600 Subject: reporead: don't trim pkgdesc length Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index b6bd8457..cf101d97 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -73,7 +73,7 @@ class Command(BaseCommand): class Pkg(object): """An interim 'container' object for holding Arch package data.""" - bare = ( 'name', 'base', 'arch', 'desc', 'filename', + bare = ( 'name', 'base', 'arch', 'filename', 'md5sum', 'sha256sum', 'url', 'packager' ) number = ( 'csize', 'isize' ) collections = ( 'depends', 'optdepends', 'conflicts', @@ -101,8 +101,8 @@ class Pkg(object): setattr(self, k, v[0][:254]) elif k in self.number: setattr(self, k, long(v[0])) - elif k == 'pgpsig': - # do NOT prune this value at all + elif k in ('desc', 'pgpsig'): + # do NOT prune these values at all setattr(self, k, v[0]) elif k == 'version': match = self.version_re.match(v[0]) -- cgit v1.2.3-2-g168b