From 5fe626c6cc8444603cf7ae5199271b69d38ff255 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 9 Jun 2011 16:17:42 -0500 Subject: Management command cleanup Now that we aren't seeing odd segfaults and hung tasks, we can remove the traceback stuff from the scripts. Also use the 'io' module only, it has been long enough. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index a8875c7e..1adc359e 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -18,7 +18,7 @@ from django.contrib.auth.models import User from django.db import transaction from django.db.models import Q -import codecs +import io import os import re import sys @@ -27,14 +27,6 @@ import logging from datetime import datetime from optparse import make_option -# New in 2.6, but fast (C implementation) in 2.7. We will use it over codecs if -# available. Eventually remove the codecs import completely. 
-io = None -try: - import io -except ImportError: - pass - from main.models import Arch, Package, PackageDepend, PackageFile, Repo from packages.models import Conflict, Provision, Replacement @@ -74,11 +66,6 @@ class Command(BaseCommand): elif v == 2: logger.level = logging.DEBUG - import signal, traceback - handler = lambda sig, stack: traceback.print_stack(stack) - signal.signal(signal.SIGQUIT, handler) - signal.signal(signal.SIGUSR1, handler) - return read_repo(arch, filename, options) @@ -101,8 +88,7 @@ class Pkg(object): setattr(self, k, None) for k in self.collections: setattr(self, k, ()) - # So we can tell the diffence between a package with no files, and a DB - # without files entries + self.files = None self.has_files = False def populate(self, values): @@ -461,16 +447,13 @@ def parse_repo(repopath): if fname not in dbfiles: continue data_file = repodb.extractfile(tarinfo) - if io is None: - data_file = codecs.EncodedFile(data_file, 'utf-8') - else: - data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), - encoding='utf=8') + data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), + encoding='utf=8') try: data = parse_info(data_file) p = pkgs.setdefault(pkgid, Pkg(reponame)) p.populate(data) - except UnicodeDecodeError, e: + except UnicodeDecodeError: logger.warn("Could not correctly decode %s, skipping file", tarinfo.name) data_file.close() -- cgit v1.2.3-2-g168b From 895f8a20d35a18f3a0cc6e1530eb40292270fc7c Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 9 Jun 2011 17:25:32 -0500 Subject: reporead: allow batching of package updates The real reason I originally added transactions to this code was to prevent half-updates; e.g. a package gets in without the matching depends values. We can safely commit between packages and resume processing the database at a later time. Take advantage of this fact and commit every so often in batch fashion if we have a lot of updates piling up. 
In the case of updating the files DB, this can really cut down on the need to hold open a long-running, statement heavy transaction and get the information public faster. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 58 +++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 9 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 1adc359e..e9878c93 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -221,6 +221,8 @@ def create_multivalued(dbpkg, repopkg, db_attr, repo_attr): collection.create(name=name) def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): + db_score = 1 + if repopkg.base: dbpkg.pkgbase = repopkg.base else: @@ -251,7 +253,7 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): dbpkg.last_update = timestamp dbpkg.save() - populate_files(dbpkg, repopkg, force=force) + db_score += populate_files(dbpkg, repopkg, force=force) dbpkg.packagedepend_set.all().delete() for y in repopkg.depends: @@ -272,6 +274,15 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): create_multivalued(dbpkg, repopkg, 'groups', 'groups') create_multivalued(dbpkg, repopkg, 'licenses', 'license') + related_score = (len(repopkg.depends) + len(repopkg.optdepends) + + len(repopkg.conflicts) + len(repopkg.provides) + + len(repopkg.replaces) + len(repopkg.groups) + + len(repopkg.license)) + if related_score: + db_score += (related_score / 20) + 1 + + return db_score + def populate_files(dbpkg, repopkg, force=False): if not force: @@ -280,11 +291,11 @@ def populate_files(dbpkg, repopkg, force=False): logger.info("DB version (%s) didn't match repo version " "(%s) for package %s, skipping file list addition", dbpkg.full_version, repopkg.full_version, dbpkg.pkgname) - return + return 0 if not dbpkg.files_last_update or not dbpkg.last_update: pass elif 
dbpkg.files_last_update > dbpkg.last_update: - return + return 0 # only delete files if we are reading a DB that contains them if repopkg.has_files: dbpkg.packagefile_set.all().delete() @@ -303,6 +314,28 @@ def populate_files(dbpkg, repopkg, force=False): pkgfile.save(force_insert=True) dbpkg.files_last_update = datetime.utcnow() dbpkg.save() + return (len(repopkg.files) / 50) + 1 + return 0 + + +class Batcher(object): + def __init__(self, threshold, start=0): + self.threshold = threshold + self.meter = start + + def batch_commit(self, score): + """ + Track updates to the database and perform a commit if the batch + becomes sufficiently large. "Large" is defined by waiting for the + sum of scores to exceed the arbitrary threshold value; once it is + hit a commit is issued. + """ + self.meter += score + if self.meter > self.threshold: + logger.debug("Committing transaction, batch threshold hit") + transaction.commit() + self.meter = 0 + @transaction.commit_on_success def db_update(archname, reponame, pkgs, options): @@ -355,19 +388,23 @@ def db_update(archname, reponame, pkgs, options): elif dbpercent < 75.0: logger.warning(msg) + batcher = Batcher(100) + if not filesonly: # packages in syncdb and not in database (add to database) for p in [x for x in pkgs if x.name in in_sync_not_db]: logger.info("Adding package %s", p.name) pkg = Package(pkgname = p.name, arch = architecture, repo = repository) - populate_pkg(pkg, p, timestamp=datetime.utcnow()) + score = populate_pkg(pkg, p, timestamp=datetime.utcnow()) + batcher.batch_commit(score) # packages in database and not in syncdb (remove from database) in_db_not_sync = dbset - syncset for p in in_db_not_sync: - logger.info("Removing package %s from database", p) + logger.info("Removing package %s", p) dbp = dbdict[p] dbp.delete() + batcher.batch_commit(score) # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset @@ -385,12 +422,15 @@ def db_update(archname, reponame, pkgs, 
options): continue else: timestamp = datetime.utcnow() + if filesonly: - logger.debug("Checking files for package %s in database", p.name) - populate_files(dbp, p, force=force) + logger.debug("Checking files for package %s", p.name) + score = populate_files(dbp, p, force=force) else: - logger.info("Updating package %s in database", p.name) - populate_pkg(dbp, p, force=force, timestamp=timestamp) + logger.info("Updating package %s", p.name) + score = populate_pkg(dbp, p, force=force, timestamp=timestamp) + + batcher.batch_commit(score) logger.info('Finished updating Arch: %s', archname) -- cgit v1.2.3-2-g168b From 01b07b5b07cd152949c9f01fec91408945273583 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 9 Jun 2011 17:30:50 -0500 Subject: Fix busted batch score on package removal Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index e9878c93..0bd5587a 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -404,7 +404,7 @@ def db_update(archname, reponame, pkgs, options): logger.info("Removing package %s", p) dbp = dbdict[p] dbp.delete() - batcher.batch_commit(score) + batcher.batch_commit(1) # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset -- cgit v1.2.3-2-g168b From 92dbad587ab77b84130b86153464647d583b677e Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Fri, 10 Jun 2011 10:43:57 -0500 Subject: reporead: two small cleanups * Parse builddate when reading from repo database file * Use defaultdict where it comes in handy Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git 
a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 0bd5587a..4d30388e 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -18,6 +18,7 @@ from django.contrib.auth.models import User from django.db import transaction from django.db.models import Q +from collections import defaultdict import io import os import re @@ -72,7 +73,7 @@ class Command(BaseCommand): class Pkg(object): """An interim 'container' object for holding Arch package data.""" bare = ( 'name', 'base', 'arch', 'desc', 'filename', - 'md5sum', 'url', 'builddate', 'packager' ) + 'md5sum', 'url', 'packager' ) number = ( 'csize', 'isize' ) collections = ( 'depends', 'optdepends', 'conflicts', 'provides', 'replaces', 'groups', 'license', 'files' ) @@ -104,6 +105,16 @@ class Pkg(object): self.rel = match.group(4) if match.group(2): self.epoch = int(match.group(2)) + elif k == 'builddate': + try: + self.builddate = datetime.utcfromtimestamp(int(v[0])) + except ValueError: + try: + self.builddate = datetime.strptime(v[0], + '%a %b %d %H:%M:%S %Y') + except ValueError: + logger.warning('Package %s had unparsable build date %s', + self.name, v[0]) elif k == 'files': self.files = v self.has_files = True @@ -235,15 +246,7 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): dbpkg.filename = repopkg.filename dbpkg.compressed_size = repopkg.csize dbpkg.installed_size = repopkg.isize - try: - dbpkg.build_date = datetime.utcfromtimestamp(int(repopkg.builddate)) - except ValueError: - try: - dbpkg.build_date = datetime.strptime(repopkg.builddate, - '%a %b %d %H:%M:%S %Y') - except ValueError: - logger.warning('Package %s had unparsable build date %s', - repopkg.name, repopkg.builddate) + dbpkg.build_date = repopkg.builddate dbpkg.packager_str = repopkg.packager # attempt to find the corresponding django user for this string dbpkg.packager = find_user(repopkg.packager) @@ -394,7 +397,7 @@ def db_update(archname, reponame, pkgs, 
options): # packages in syncdb and not in database (add to database) for p in [x for x in pkgs if x.name in in_sync_not_db]: logger.info("Adding package %s", p.name) - pkg = Package(pkgname = p.name, arch = architecture, repo = repository) + pkg = Package(pkgname=p.name, arch=architecture, repo=repository) score = populate_pkg(pkg, p, timestamp=datetime.utcnow()) batcher.batch_commit(score) @@ -480,7 +483,8 @@ def parse_repo(repopath): repodb = tarfile.open(repopath, "r") logger.debug("Starting package parsing") dbfiles = ('desc', 'depends', 'files') - pkgs = {} + newpkg = lambda: Pkg(reponame) + pkgs = defaultdict(newpkg) for tarinfo in repodb.getmembers(): if tarinfo.isreg(): pkgid, fname = os.path.split(tarinfo.name) @@ -490,9 +494,7 @@ def parse_repo(repopath): data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), encoding='utf=8') try: - data = parse_info(data_file) - p = pkgs.setdefault(pkgid, Pkg(reponame)) - p.populate(data) + pkgs[pkgid].populate(parse_info(data_file)) except UnicodeDecodeError: logger.warn("Could not correctly decode %s, skipping file", tarinfo.name) -- cgit v1.2.3-2-g168b From b336dd15598132d1c501a9d44bc4d5a0e64bfb2e Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Fri, 10 Jun 2011 10:46:06 -0500 Subject: reporead: small memory/perf improvements Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 4d30388e..baf7fee1 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -116,11 +116,11 @@ class Pkg(object): logger.warning('Package %s had unparsable build date %s', self.name, v[0]) elif k == 'files': - self.files = v + self.files = tuple(v) self.has_files = True else: # anything left in collections - setattr(self, k, v) + setattr(self, k, tuple(v)) @property def 
full_version(self): @@ -528,7 +528,9 @@ def read_repo(primary_arch, repo_file, options): else: # we don't include mis-arched packages logger.warning("Package %s arch = %s", - package.name,package.arch) + package.name, package.arch) + del packages + logger.info('Starting database updates.') for arch in sorted(packages_arches.keys()): db_update(arch, repo, packages_arches[arch], options) -- cgit v1.2.3-2-g168b From da20949c8cc185e91dbaae1b8369fcffa3447081 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 23 Jun 2011 19:11:12 -0500 Subject: Move find_user method to devel utils This could be handy elsewhere as well, and it is loosely coupled to anything else in reporead. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 51 +---------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index baf7fee1..138931ff 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -16,7 +16,6 @@ Example: from django.core.management.base import BaseCommand, CommandError from django.contrib.auth.models import User from django.db import transaction -from django.db.models import Q from collections import defaultdict import io @@ -28,6 +27,7 @@ import logging from datetime import datetime from optparse import make_option +from devel.utils import find_user from main.models import Arch, Package, PackageDepend, PackageFile, Repo from packages.models import Conflict, Provision, Replacement @@ -130,55 +130,6 @@ class Pkg(object): return u'%s-%s' % (self.ver, self.rel) -def find_user(userstring): - ''' - Attempt to find the corresponding User object for a standard - packager string, e.g. something like - 'A. U. Thor '. - We start by searching for a matching email address; we then move onto - matching by first/last name. If we cannot find a user, then return None. 
- ''' - if userstring in find_user.cache: - return find_user.cache[userstring] - matches = re.match(r'^([^<]+)? ?<([^>]*)>', userstring) - if not matches: - return None - - user = None - name = matches.group(1) - email = matches.group(2) - - def user_email(): - return User.objects.get(email=email) - def profile_email(): - return User.objects.get(userprofile__public_email=email) - def user_name(): - # yes, a bit odd but this is the easiest way since we can't always be - # sure how to split the name. Ensure every 'token' appears in at least - # one of the two name fields. - name_q = Q() - for token in name.split(): - # ignore quoted parts; e.g. nicknames in strings - if re.match(r'^[\'"].*[\'"]$', token): - continue - name_q &= (Q(first_name__icontains=token) | - Q(last_name__icontains=token)) - return User.objects.get(name_q) - - for matcher in (user_email, profile_email, user_name): - try: - user = matcher() - break - except (User.DoesNotExist, User.MultipleObjectsReturned): - pass - - find_user.cache[userstring] = user - return user - -# cached mappings of user strings -> User objects so we don't have to do the -# lookup more than strictly necessary. -find_user.cache = {} - DEPEND_RE = re.compile(r"^(.+?)((>=|<=|=|>|<)(.*))?$") def create_depend(package, dep_str, optional=False): -- cgit v1.2.3-2-g168b From 9156003d2d93de57c663901c39ac66316a3d969e Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 23 Jun 2011 19:50:46 -0500 Subject: Turn find_user into UserFinder class This moves the cache inside an instance. Also add a few more tests. 
Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 138931ff..470b785d 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -27,7 +27,7 @@ import logging from datetime import datetime from optparse import make_option -from devel.utils import find_user +from devel.utils import UserFinder from main.models import Arch, Package, PackageDepend, PackageFile, Repo from packages.models import Conflict, Provision, Replacement @@ -182,6 +182,8 @@ def create_multivalued(dbpkg, repopkg, db_attr, repo_attr): for name in getattr(repopkg, repo_attr): collection.create(name=name) +finder = UserFinder() + def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): db_score = 1 @@ -200,7 +202,7 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): dbpkg.build_date = repopkg.builddate dbpkg.packager_str = repopkg.packager # attempt to find the corresponding django user for this string - dbpkg.packager = find_user(repopkg.packager) + dbpkg.packager = finder.find(repopkg.packager) if timestamp: dbpkg.flag_date = None -- cgit v1.2.3-2-g168b From b6f86d9ab0d6910f0f70398b07e965d337bd9e78 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 16 Aug 2011 16:04:16 -0500 Subject: Add two new DB fields to reporead Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 470b785d..97fdbb73 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -73,7 +73,7 @@ class Command(BaseCommand): class Pkg(object): """An interim 'container' object for holding Arch package 
data.""" bare = ( 'name', 'base', 'arch', 'desc', 'filename', - 'md5sum', 'url', 'packager' ) + 'md5sum', 'sha256sum', 'pgpsig', 'url', 'packager' ) number = ( 'csize', 'isize' ) collections = ( 'depends', 'optdepends', 'conflicts', 'provides', 'replaces', 'groups', 'license', 'files' ) -- cgit v1.2.3-2-g168b From e5d09fb7e9003b7f96685af9c0a722b45746448e Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 17 Aug 2011 16:18:12 -0500 Subject: Add PGP signature package field And add eventual display code for it to the details template, but don't show it yet as no packages will have it. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 1 + 1 file changed, 1 insertion(+) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 97fdbb73..cf597577 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -203,6 +203,7 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): dbpkg.packager_str = repopkg.packager # attempt to find the corresponding django user for this string dbpkg.packager = finder.find(repopkg.packager) + dbpkg.pgp_signature = repopkg.pgpsig if timestamp: dbpkg.flag_date = None -- cgit v1.2.3-2-g168b From 1b83844b30d3271b8fb50757d827c7b8fe8b5585 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 26 Oct 2011 03:25:15 -0500 Subject: Ensure PGP signature values are not trimmed This makes them totally unusable for any real purpose down the road. 
Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index cf597577..a8e3219e 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -73,7 +73,7 @@ class Command(BaseCommand): class Pkg(object): """An interim 'container' object for holding Arch package data.""" bare = ( 'name', 'base', 'arch', 'desc', 'filename', - 'md5sum', 'sha256sum', 'pgpsig', 'url', 'packager' ) + 'md5sum', 'sha256sum', 'url', 'packager' ) number = ( 'csize', 'isize' ) collections = ( 'depends', 'optdepends', 'conflicts', 'provides', 'replaces', 'groups', 'license', 'files' ) @@ -85,6 +85,7 @@ class Pkg(object): self.ver = None self.rel = None self.epoch = 0 + self.pgpsig = None for k in self.bare + self.number: setattr(self, k, None) for k in self.collections: @@ -99,6 +100,9 @@ class Pkg(object): setattr(self, k, v[0][:254]) elif k in self.number: setattr(self, k, long(v[0])) + elif k == 'pgpsig': + # do NOT prune this value at all + setattr(self, k, v[0]) elif k == 'version': match = self.version_re.match(v[0]) self.ver = match.group(3) -- cgit v1.2.3-2-g168b From 9550236a87fc65827e994bea108350a43d3f161f Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 15 Nov 2011 21:26:57 -0600 Subject: Improve primary arch validation Ensure we can accept either an Arch object or an architecture name when passed to read_repo() by moving the validation there and being a bit more careful about typechecking and object lookup. 
Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index a8e3219e..b4966834 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -51,8 +51,6 @@ class Command(BaseCommand): def handle(self, arch=None, filename=None, **options): if not arch: raise CommandError('Architecture is required.') - if not validate_arch(arch): - raise CommandError('Specified architecture %s is not currently known.' % arch) if not filename: raise CommandError('Package database file is required.') filename = os.path.normpath(filename) @@ -464,29 +462,38 @@ def parse_repo(repopath): logger.info("Finished repo parsing, %d total packages", len(pkgs)) return (reponame, pkgs.values()) -def validate_arch(archname): +def locate_arch(arch): "Check if arch is valid." - return Arch.objects.filter(name__iexact=archname).exists() + if isinstance(arch, Arch): + return arch + try: + return Arch.objects.get(name__iexact=arch) + except Arch.DoesNotExist: + raise CommandError( + 'Specified architecture %s is not currently known.' % arch) + def read_repo(primary_arch, repo_file, options): """ Parses repo.db.tar.gz file and returns exit status. 
""" + # always returns an Arch object, regardless of what is passed in + primary_arch = locate_arch(primary_arch) + repo, packages = parse_repo(repo_file) # group packages by arch -- to handle noarch stuff packages_arches = {} for arch in Arch.objects.filter(agnostic=True): packages_arches[arch.name] = [] - packages_arches[primary_arch] = [] + packages_arches[primary_arch.name] = [] for package in packages: if package.arch in packages_arches: packages_arches[package.arch].append(package) else: # we don't include mis-arched packages - logger.warning("Package %s arch = %s", - package.name, package.arch) + logger.warning("Package %s arch = %s", package.name, package.arch) del packages logger.info('Starting database updates.') -- cgit v1.2.3-2-g168b From c00e7e84045613ee2aa80f66b9972db971ab3f26 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 15 Nov 2011 23:43:42 -0600 Subject: reporead: clean up some debug logging Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index b4966834..45229524 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -36,6 +36,8 @@ logging.basicConfig( format='%(asctime)s -> %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', stream=sys.stderr) +TRACE = 5 +logging.addLevelName(TRACE, 'TRACE') logger = logging.getLogger() class Command(BaseCommand): @@ -368,7 +370,7 @@ def db_update(archname, reponame, pkgs, options): # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset for p in [x for x in pkgs if x.name in pkg_in_both]: - logger.debug("Looking for package updates") + logger.debug("Checking package %s", p.name) dbp = dbdict[p.name] timestamp = None # for a force, we don't want to update the timestamp. 
@@ -406,7 +408,7 @@ def parse_info(iofile): continue elif line.startswith('%') and line.endswith('%'): blockname = line[1:-1].lower() - logger.debug("Parsing package block %s", blockname) + logger.log(TRACE, "Parsing package block %s", blockname) store[blockname] = [] elif blockname: store[blockname].append(line) @@ -456,7 +458,7 @@ def parse_repo(repopath): tarinfo.name) data_file.close() - logger.debug("Done parsing file %s", fname) + logger.debug("Done parsing file %s/%s", pkgid, fname) repodb.close() logger.info("Finished repo parsing, %d total packages", len(pkgs)) -- cgit v1.2.3-2-g168b From 404c4b400b2bd2a14e0363e33d66505c51903fe7 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 16 Nov 2011 12:48:36 -0600 Subject: reporead: a few small tweaks Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 45229524..ad76db4d 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -450,13 +450,14 @@ def parse_repo(repopath): continue data_file = repodb.extractfile(tarinfo) data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), - encoding='utf=8') + encoding='UTF-8') try: pkgs[pkgid].populate(parse_info(data_file)) except UnicodeDecodeError: logger.warn("Could not correctly decode %s, skipping file", tarinfo.name) data_file.close() + del data_file logger.debug("Done parsing file %s/%s", pkgid, fname) @@ -498,10 +499,10 @@ def read_repo(primary_arch, repo_file, options): logger.warning("Package %s arch = %s", package.name, package.arch) del packages - logger.info('Starting database updates.') + logger.info('Starting database updates for %s.', repo_file) for arch in sorted(packages_arches.keys()): db_update(arch, repo, packages_arches[arch], options) - logger.info('Finished database updates.') + 
logger.info('Finished database updates for %s.', repo_file) return 0 # vim: set ts=4 sw=4 et: -- cgit v1.2.3-2-g168b From a9819e3d715ce3e5c20c9665db9a6100f06ab562 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 17 Nov 2011 12:34:12 -0600 Subject: Ensure reporead is protected against simultaneous runs This adds a bunch of transaction magic and SELECT FOR UPDATE stuff to reporead to cope with the now-concurrent runs of reporead we get when invoked from our inotify-based updater. The collision occurs with 'any' architecture packages as both repo databases contain the new version, and the updates occur at exactly the same time. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 206 +++++++++++++++++----------------- 1 file changed, 106 insertions(+), 100 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index ad76db4d..b6bd8457 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -13,10 +13,6 @@ Example: ./manage.py reporead i686 /tmp/core.db.tar.gz """ -from django.core.management.base import BaseCommand, CommandError -from django.contrib.auth.models import User -from django.db import transaction - from collections import defaultdict import io import os @@ -27,6 +23,11 @@ import logging from datetime import datetime from optparse import make_option +from django.core.management.base import BaseCommand, CommandError +from django.contrib.auth.models import User +from django.db import connections, router, transaction +from django.db.utils import IntegrityError + from devel.utils import UserFinder from main.models import Arch, Package, PackageDepend, PackageFile, Repo from packages.models import Conflict, Provision, Replacement @@ -189,8 +190,6 @@ def create_multivalued(dbpkg, repopkg, db_attr, repo_attr): finder = UserFinder() def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): - db_score = 
1 - if repopkg.base: dbpkg.pkgbase = repopkg.base else: @@ -214,7 +213,7 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): dbpkg.last_update = timestamp dbpkg.save() - db_score += populate_files(dbpkg, repopkg, force=force) + populate_files(dbpkg, repopkg, force=force) dbpkg.packagedepend_set.all().delete() for y in repopkg.depends: @@ -235,28 +234,23 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None): create_multivalued(dbpkg, repopkg, 'groups', 'groups') create_multivalued(dbpkg, repopkg, 'licenses', 'license') - related_score = (len(repopkg.depends) + len(repopkg.optdepends) - + len(repopkg.conflicts) + len(repopkg.provides) - + len(repopkg.replaces) + len(repopkg.groups) - + len(repopkg.license)) - if related_score: - db_score += (related_score / 20) + 1 - return db_score +pkg_same_version = lambda pkg, dbpkg: pkg.ver == dbpkg.pkgver \ + and pkg.rel == dbpkg.pkgrel and pkg.epoch == dbpkg.epoch def populate_files(dbpkg, repopkg, force=False): if not force: - if dbpkg.pkgver != repopkg.ver or dbpkg.pkgrel != repopkg.rel \ - or dbpkg.epoch != repopkg.epoch: + if not pkg_same_version(repopkg, dbpkg): logger.info("DB version (%s) didn't match repo version " "(%s) for package %s, skipping file list addition", dbpkg.full_version, repopkg.full_version, dbpkg.pkgname) - return 0 + return if not dbpkg.files_last_update or not dbpkg.last_update: pass elif dbpkg.files_last_update > dbpkg.last_update: - return 0 + return + # only delete files if we are reading a DB that contains them if repopkg.has_files: dbpkg.packagefile_set.all().delete() @@ -275,30 +269,19 @@ def populate_files(dbpkg, repopkg, force=False): pkgfile.save(force_insert=True) dbpkg.files_last_update = datetime.utcnow() dbpkg.save() - return (len(repopkg.files) / 50) + 1 - return 0 - -class Batcher(object): - def __init__(self, threshold, start=0): - self.threshold = threshold - self.meter = start - def batch_commit(self, score): - """ - Track updates to the database and perform 
a commit if the batch - becomes sufficiently large. "Large" is defined by waiting for the - sum of scores to exceed the arbitrary threshold value; once it is - hit a commit is issued. - """ - self.meter += score - if self.meter > self.threshold: - logger.debug("Committing transaction, batch threshold hit") - transaction.commit() - self.meter = 0 +def select_pkg_for_update(dbpkg): + database = router.db_for_write(Package, instance=dbpkg) + connection = connections[database] + if 'sqlite' in connection.settings_dict['ENGINE'].lower(): + return dbpkg + new_pkg = Package.objects.raw( + 'SELECT * FROM packages WHERE id = %s FOR UPDATE', + [dbpkg.id]) + return list(new_pkg)[0] -@transaction.commit_on_success def db_update(archname, reponame, pkgs, options): """ Parses a list and updates the Arch dev database accordingly. @@ -310,88 +293,111 @@ def db_update(archname, reponame, pkgs, options): logger.info('Updating Arch: %s', archname) force = options.get('force', False) filesonly = options.get('filesonly', False) - repository = Repo.objects.get(name__iexact=reponame) - architecture = Arch.objects.get(name__iexact=archname) - # no-arg order_by() removes even the default ordering; we don't need it - dbpkgs = Package.objects.filter( - arch=architecture, repo=repository).order_by() - # This makes our inner loop where we find packages by name *way* more - # efficient by not having to go to the database for each package to - # SELECT them by name. - dbdict = dict([(pkg.pkgname, pkg) for pkg in dbpkgs]) - - logger.debug("Creating sets") - dbset = set(dbdict.keys()) - syncset = set([pkg.name for pkg in pkgs]) - logger.info("%d packages in current web DB", len(dbset)) - logger.info("%d packages in new updating db", len(syncset)) - in_sync_not_db = syncset - dbset - logger.info("%d packages in sync not db", len(in_sync_not_db)) - - # Try to catch those random package deletions that make Eric so unhappy. 
- if len(dbset): - dbpercent = 100.0 * len(syncset) / len(dbset) - else: - dbpercent = 0.0 - logger.info("DB package ratio: %.1f%%", dbpercent) - - # Fewer than 20 packages makes the percentage check unreliable, but it also - # means we expect the repo to fluctuate a lot. - msg = "Package database has %.1f%% the number of packages in the " \ - "web database" % dbpercent - if len(dbset) == 0 and len(syncset) == 0: - pass - elif not filesonly and \ - len(dbset) > 20 and dbpercent < 50.0 and \ - not repository.testing and not repository.staging: - logger.error(msg) - raise Exception(msg) - elif dbpercent < 75.0: - logger.warning(msg) - - batcher = Batcher(100) + + with transaction.commit_manually(): + repository = Repo.objects.get(name__iexact=reponame) + architecture = Arch.objects.get(name__iexact=archname) + # no-arg order_by() removes even the default ordering; we don't need it + dbpkgs = Package.objects.filter( + arch=architecture, repo=repository).order_by() + # This makes our inner loop where we find packages by name *way* more + # efficient by not having to go to the database for each package to + # SELECT them by name. + dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) + + logger.debug("Creating sets") + dbset = set(dbdict.keys()) + syncset = set([pkg.name for pkg in pkgs]) + logger.info("%d packages in current web DB", len(dbset)) + logger.info("%d packages in new updating db", len(syncset)) + in_sync_not_db = syncset - dbset + logger.info("%d packages in sync not db", len(in_sync_not_db)) + + # Try to catch those random package deletions that make Eric so unhappy. + if len(dbset): + dbpercent = 100.0 * len(syncset) / len(dbset) + else: + dbpercent = 0.0 + logger.info("DB package ratio: %.1f%%", dbpercent) + + # Fewer than 20 packages makes the percentage check unreliable, but it also + # means we expect the repo to fluctuate a lot. 
+ msg = "Package database has %.1f%% the number of packages in the " \ + "web database" % dbpercent + if len(dbset) == 0 and len(syncset) == 0: + pass + elif not filesonly and \ + len(dbset) > 20 and dbpercent < 50.0 and \ + not repository.testing and not repository.staging: + logger.error(msg) + raise Exception(msg) + elif dbpercent < 75.0: + logger.warning(msg) + + # If isolation level is repeatable-read, we need to ensure each package + # update starts a new transaction and re-queries the database as necessary + # to guard against simultaneous updates + transaction.commit() if not filesonly: # packages in syncdb and not in database (add to database) - for p in [x for x in pkgs if x.name in in_sync_not_db]: - logger.info("Adding package %s", p.name) - pkg = Package(pkgname=p.name, arch=architecture, repo=repository) - score = populate_pkg(pkg, p, timestamp=datetime.utcnow()) - batcher.batch_commit(score) + for pkg in (pkg for pkg in pkgs if pkg.name in in_sync_not_db): + logger.info("Adding package %s", pkg.name) + dbpkg = Package(pkgname=pkg.name, arch=architecture, repo=repository) + try: + with transaction.commit_on_success(): + populate_pkg(dbpkg, pkg, timestamp=datetime.utcnow()) + except IntegrityError: + logger.warning("Could not add package %s; " + "not fatal if another thread beat us to it.", + pkg.name, exc_info=True) # packages in database and not in syncdb (remove from database) - in_db_not_sync = dbset - syncset - for p in in_db_not_sync: - logger.info("Removing package %s", p) - dbp = dbdict[p] - dbp.delete() - batcher.batch_commit(1) + for pkgname in (dbset - syncset): + logger.info("Removing package %s", pkgname) + dbpkg = dbdict[pkgname] + with transaction.commit_on_success(): + # no race condition here as long as simultaneous threads both + # issue deletes; second delete will be a no-op + dbpkg.delete() # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset - for p in [x for x in pkgs if x.name in 
pkg_in_both]: - logger.debug("Checking package %s", p.name) - dbp = dbdict[p.name] + for pkg in (x for x in pkgs if x.name in pkg_in_both): + logger.debug("Checking package %s", pkg.name) + dbpkg = dbdict[pkg.name] timestamp = None # for a force, we don't want to update the timestamp. # for a non-force, we don't want to do anything at all. if filesonly: pass - elif p.ver == dbp.pkgver and p.rel == dbp.pkgrel \ - and p.epoch == dbp.epoch: + elif pkg_same_version(pkg, dbpkg): if not force: continue else: timestamp = datetime.utcnow() + # The odd select_for_update song and dance here are to ensure + # simultaneous updates don't happen on a package, causing + # files/depends/all related items to be double-imported. if filesonly: - logger.debug("Checking files for package %s", p.name) - score = populate_files(dbp, p, force=force) + with transaction.commit_on_success(): + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + if pkg_same_version(pkg, dbpkg): + logger.debug("Package %s was already updated", pkg.name) + continue + logger.debug("Checking files for package %s", pkg.name) + populate_files(dbpkg, pkg, force=force) else: - logger.info("Updating package %s", p.name) - score = populate_pkg(dbp, p, force=force, timestamp=timestamp) - - batcher.batch_commit(score) + with transaction.commit_on_success(): + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + if pkg_same_version(pkg, dbpkg): + logger.debug("Package %s was already updated", pkg.name) + continue + logger.info("Updating package %s", pkg.name) + populate_pkg(dbpkg, pkg, force=force, timestamp=timestamp) logger.info('Finished updating Arch: %s', archname) -- cgit v1.2.3-2-g168b From 2cb4f97bb235217d6e56deded1444f5e84f08b71 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Thu, 17 Nov 2011 13:36:27 -0600 Subject: reporead: don't trim pkgdesc length Signed-off-by: Dan McGee --- 
devel/management/commands/reporead.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index b6bd8457..cf101d97 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -73,7 +73,7 @@ class Command(BaseCommand): class Pkg(object): """An interim 'container' object for holding Arch package data.""" - bare = ( 'name', 'base', 'arch', 'desc', 'filename', + bare = ( 'name', 'base', 'arch', 'filename', 'md5sum', 'sha256sum', 'url', 'packager' ) number = ( 'csize', 'isize' ) collections = ( 'depends', 'optdepends', 'conflicts', @@ -101,8 +101,8 @@ class Pkg(object): setattr(self, k, v[0][:254]) elif k in self.number: setattr(self, k, long(v[0])) - elif k == 'pgpsig': - # do NOT prune this value at all + elif k in ('desc', 'pgpsig'): + # do NOT prune these values at all setattr(self, k, v[0]) elif k == 'version': match = self.version_re.match(v[0]) -- cgit v1.2.3-2-g168b From ac34e358103ee718369692b9ba5afe6830a1df92 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 30 Nov 2011 12:25:54 -0600 Subject: reporead: fix filesonly needs update checks This was broken after the select for update changes. We really should split the whole filesonly update into another method instead of the current shotgun approach with conditionals everywhere. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index cf101d97..f8cc2034 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -382,11 +382,13 @@ def db_update(archname, reponame, pkgs, options): # files/depends/all related items to be double-imported. 
if filesonly: with transaction.commit_on_success(): + if not dbpkg.files_last_update or not dbpkg.last_update: + pass + elif dbpkg.files_last_update > dbpkg.last_update: + logger.debug("Files for %s are up to date", pkg.name) + continue # TODO Django 1.4 select_for_update() will work once released dbpkg = select_pkg_for_update(dbpkg) - if pkg_same_version(pkg, dbpkg): - logger.debug("Package %s was already updated", pkg.name) - continue logger.debug("Checking files for package %s", pkg.name) populate_files(dbpkg, pkg, force=force) else: -- cgit v1.2.3-2-g168b From dca9fd2ea687b8d20fd5c39c1449846ddef1e3c2 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 30 Nov 2011 13:04:15 -0600 Subject: reporead: split out filesonly update method This removes a bunch of the conditional logic at a slight cost of some code duplication. However, the methods and madness is now much easier to follow. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 170 +++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 75 deletions(-) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index f8cc2034..e4ba8580 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -282,40 +282,20 @@ def select_pkg_for_update(dbpkg): return list(new_pkg)[0] -def db_update(archname, reponame, pkgs, options): - """ - Parses a list and updates the Arch dev database accordingly. - - Arguments: - pkgs -- A list of Pkg objects. 
- - """ - logger.info('Updating Arch: %s', archname) - force = options.get('force', False) - filesonly = options.get('filesonly', False) - +def update_common(archname, reponame, pkgs, sanity_check=True): with transaction.commit_manually(): repository = Repo.objects.get(name__iexact=reponame) architecture = Arch.objects.get(name__iexact=archname) # no-arg order_by() removes even the default ordering; we don't need it dbpkgs = Package.objects.filter( arch=architecture, repo=repository).order_by() - # This makes our inner loop where we find packages by name *way* more - # efficient by not having to go to the database for each package to - # SELECT them by name. - dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) - - logger.debug("Creating sets") - dbset = set(dbdict.keys()) - syncset = set([pkg.name for pkg in pkgs]) - logger.info("%d packages in current web DB", len(dbset)) - logger.info("%d packages in new updating db", len(syncset)) - in_sync_not_db = syncset - dbset - logger.info("%d packages in sync not db", len(in_sync_not_db)) + + logger.info("%d packages in current web DB", len(dbpkgs)) + logger.info("%d packages in new updating DB", len(pkgs)) # Try to catch those random package deletions that make Eric so unhappy. - if len(dbset): - dbpercent = 100.0 * len(syncset) / len(dbset) + if len(dbpkgs): + dbpercent = 100.0 * len(pkgs) / len(dbpkgs) else: dbpercent = 0.0 logger.info("DB package ratio: %.1f%%", dbpercent) @@ -324,11 +304,13 @@ def db_update(archname, reponame, pkgs, options): # means we expect the repo to fluctuate a lot. 
msg = "Package database has %.1f%% the number of packages in the " \ "web database" % dbpercent - if len(dbset) == 0 and len(syncset) == 0: + if not sanity_check: + pass + elif repository.testing or repository.staging: pass - elif not filesonly and \ - len(dbset) > 20 and dbpercent < 50.0 and \ - not repository.testing and not repository.staging: + elif len(dbpkgs) == 0 and len(pkgs) == 0: + pass + elif len(dbpkgs) > 20 and dbpercent < 50.0: logger.error(msg) raise Exception(msg) elif dbpercent < 75.0: @@ -339,27 +321,45 @@ def db_update(archname, reponame, pkgs, options): # to guard against simultaneous updates transaction.commit() - if not filesonly: - # packages in syncdb and not in database (add to database) - for pkg in (pkg for pkg in pkgs if pkg.name in in_sync_not_db): - logger.info("Adding package %s", pkg.name) - dbpkg = Package(pkgname=pkg.name, arch=architecture, repo=repository) - try: - with transaction.commit_on_success(): - populate_pkg(dbpkg, pkg, timestamp=datetime.utcnow()) - except IntegrityError: - logger.warning("Could not add package %s; " - "not fatal if another thread beat us to it.", - pkg.name, exc_info=True) - - # packages in database and not in syncdb (remove from database) - for pkgname in (dbset - syncset): - logger.info("Removing package %s", pkgname) - dbpkg = dbdict[pkgname] + return dbpkgs + +def db_update(archname, reponame, pkgs, force=False): + """ + Parses a list of packages and updates the packages database accordingly. + """ + logger.info('Updating %s (%s)', reponame, archname) + dbpkgs = update_common(archname, reponame, pkgs, True) + + # This makes our inner loop where we find packages by name *way* more + # efficient by not having to go to the database for each package to + # SELECT them by name. 
+ dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) + + dbset = set(dbdict.keys()) + syncset = set([pkg.name for pkg in pkgs]) + + in_sync_not_db = syncset - dbset + logger.info("%d packages in sync not db", len(in_sync_not_db)) + # packages in syncdb and not in database (add to database) + for pkg in (pkg for pkg in pkgs if pkg.name in in_sync_not_db): + logger.info("Adding package %s", pkg.name) + dbpkg = Package(pkgname=pkg.name, arch=architecture, repo=repository) + try: with transaction.commit_on_success(): - # no race condition here as long as simultaneous threads both - # issue deletes; second delete will be a no-op - dbpkg.delete() + populate_pkg(dbpkg, pkg, timestamp=datetime.utcnow()) + except IntegrityError: + logger.warning("Could not add package %s; " + "not fatal if another thread beat us to it.", + pkg.name, exc_info=True) + + # packages in database and not in syncdb (remove from database) + for pkgname in (dbset - syncset): + logger.info("Removing package %s", pkgname) + dbpkg = dbdict[pkgname] + with transaction.commit_on_success(): + # no race condition here as long as simultaneous threads both + # issue deletes; second delete will be a no-op + dbpkg.delete() # packages in both database and in syncdb (update in database) pkg_in_both = syncset & dbset @@ -369,9 +369,7 @@ def db_update(archname, reponame, pkgs, options): timestamp = None # for a force, we don't want to update the timestamp. # for a non-force, we don't want to do anything at all. - if filesonly: - pass - elif pkg_same_version(pkg, dbpkg): + if pkg_same_version(pkg, dbpkg): if not force: continue else: @@ -380,28 +378,45 @@ def db_update(archname, reponame, pkgs, options): # The odd select_for_update song and dance here are to ensure # simultaneous updates don't happen on a package, causing # files/depends/all related items to be double-imported. 
- if filesonly: - with transaction.commit_on_success(): - if not dbpkg.files_last_update or not dbpkg.last_update: - pass - elif dbpkg.files_last_update > dbpkg.last_update: - logger.debug("Files for %s are up to date", pkg.name) - continue - # TODO Django 1.4 select_for_update() will work once released - dbpkg = select_pkg_for_update(dbpkg) - logger.debug("Checking files for package %s", pkg.name) - populate_files(dbpkg, pkg, force=force) - else: - with transaction.commit_on_success(): - # TODO Django 1.4 select_for_update() will work once released - dbpkg = select_pkg_for_update(dbpkg) - if pkg_same_version(pkg, dbpkg): - logger.debug("Package %s was already updated", pkg.name) - continue - logger.info("Updating package %s", pkg.name) - populate_pkg(dbpkg, pkg, force=force, timestamp=timestamp) + with transaction.commit_on_success(): + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + if pkg_same_version(pkg, dbpkg): + logger.debug("Package %s was already updated", pkg.name) + continue + logger.info("Updating package %s", pkg.name) + populate_pkg(dbpkg, pkg, force=force, timestamp=timestamp) + + logger.info('Finished updating arch: %s', archname) + + +def filesonly_update(archname, reponame, pkgs, force=False): + """ + Parses a list of packages and updates the packages database accordingly. + """ + logger.info('Updating files for %s (%s)', reponame, archname) + dbpkgs = update_common(archname, reponame, pkgs, False) + dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs) + dbset = set(dbdict.keys()) + + for pkg in (pkg for pkg in pkgs if pkg.name in dbset): + dbpkg = dbdict[pkg.name] + + # The odd select_for_update song and dance here are to ensure + # simultaneous updates don't happen on a package, causing + # files to be double-imported. 
+ with transaction.commit_on_success(): + if not dbpkg.files_last_update or not dbpkg.last_update: + pass + elif dbpkg.files_last_update > dbpkg.last_update: + logger.debug("Files for %s are up to date", pkg.name) + continue + # TODO Django 1.4 select_for_update() will work once released + dbpkg = select_pkg_for_update(dbpkg) + logger.debug("Checking files for package %s", pkg.name) + populate_files(dbpkg, pkg, force=force) - logger.info('Finished updating Arch: %s', archname) + logger.info('Finished updating arch: %s', archname) def parse_info(iofile): @@ -490,6 +505,8 @@ def read_repo(primary_arch, repo_file, options): """ # always returns an Arch object, regardless of what is passed in primary_arch = locate_arch(primary_arch) + force = options.get('force', False) + filesonly = options.get('filesonly', False) repo, packages = parse_repo(repo_file) @@ -509,7 +526,10 @@ def read_repo(primary_arch, repo_file, options): logger.info('Starting database updates for %s.', repo_file) for arch in sorted(packages_arches.keys()): - db_update(arch, repo, packages_arches[arch], options) + if filesonly: + filesonly_update(arch, repo, packages_arches[arch], force) + else: + db_update(arch, repo, packages_arches[arch], force) logger.info('Finished database updates for %s.', repo_file) return 0 -- cgit v1.2.3-2-g168b From 4d02cd5b5d4437dd1543e2d45044db72da1989f4 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 30 Nov 2011 23:56:12 -0600 Subject: reporead: fix not defined variable Way to fail at refactoring, Dan. 
Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'devel/management/commands/reporead.py') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index e4ba8580..c444538b 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -329,6 +329,8 @@ def db_update(archname, reponame, pkgs, force=False): """ logger.info('Updating %s (%s)', reponame, archname) dbpkgs = update_common(archname, reponame, pkgs, True) + repository = Repo.objects.get(name__iexact=reponame) + architecture = Arch.objects.get(name__iexact=archname) # This makes our inner loop where we find packages by name *way* more # efficient by not having to go to the database for each package to -- cgit v1.2.3-2-g168b