summary | refs | log | tree | commit | diff
path: root/devel/management/commands/reporead.py
diff options
context:
space:
mode:
Diffstat (limited to 'devel/management/commands/reporead.py')
-rw-r--r-- devel/management/commands/reporead.py 369
1 file changed, 197 insertions, 172 deletions
diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py
index a8875c7e..c444538b 100644
--- a/devel/management/commands/reporead.py
+++ b/devel/management/commands/reporead.py
@@ -13,12 +13,8 @@ Example:
./manage.py reporead i686 /tmp/core.db.tar.gz
"""
-from django.core.management.base import BaseCommand, CommandError
-from django.contrib.auth.models import User
-from django.db import transaction
-from django.db.models import Q
-
-import codecs
+from collections import defaultdict
+import io
import os
import re
import sys
@@ -27,14 +23,12 @@ import logging
from datetime import datetime
from optparse import make_option
-# New in 2.6, but fast (C implementation) in 2.7. We will use it over codecs if
-# available. Eventually remove the codecs import completely.
-io = None
-try:
- import io
-except ImportError:
- pass
+from django.core.management.base import BaseCommand, CommandError
+from django.contrib.auth.models import User
+from django.db import connections, router, transaction
+from django.db.utils import IntegrityError
+from devel.utils import UserFinder
from main.models import Arch, Package, PackageDepend, PackageFile, Repo
from packages.models import Conflict, Provision, Replacement
@@ -43,6 +37,8 @@ logging.basicConfig(
format='%(asctime)s -> %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
stream=sys.stderr)
+TRACE = 5
+logging.addLevelName(TRACE, 'TRACE')
logger = logging.getLogger()
class Command(BaseCommand):
@@ -58,8 +54,6 @@ class Command(BaseCommand):
def handle(self, arch=None, filename=None, **options):
if not arch:
raise CommandError('Architecture is required.')
- if not validate_arch(arch):
- raise CommandError('Specified architecture %s is not currently known.' % arch)
if not filename:
raise CommandError('Package database file is required.')
filename = os.path.normpath(filename)
@@ -74,18 +68,13 @@ class Command(BaseCommand):
elif v == 2:
logger.level = logging.DEBUG
- import signal, traceback
- handler = lambda sig, stack: traceback.print_stack(stack)
- signal.signal(signal.SIGQUIT, handler)
- signal.signal(signal.SIGUSR1, handler)
-
return read_repo(arch, filename, options)
class Pkg(object):
"""An interim 'container' object for holding Arch package data."""
- bare = ( 'name', 'base', 'arch', 'desc', 'filename',
- 'md5sum', 'url', 'builddate', 'packager' )
+ bare = ( 'name', 'base', 'arch', 'filename',
+ 'md5sum', 'sha256sum', 'url', 'packager' )
number = ( 'csize', 'isize' )
collections = ( 'depends', 'optdepends', 'conflicts',
'provides', 'replaces', 'groups', 'license', 'files' )
@@ -97,12 +86,12 @@ class Pkg(object):
self.ver = None
self.rel = None
self.epoch = 0
+ self.pgpsig = None
for k in self.bare + self.number:
setattr(self, k, None)
for k in self.collections:
setattr(self, k, ())
- # So we can tell the diffence between a package with no files, and a DB
- # without files entries
+ self.files = None
self.has_files = False
def populate(self, values):
@@ -112,18 +101,31 @@ class Pkg(object):
setattr(self, k, v[0][:254])
elif k in self.number:
setattr(self, k, long(v[0]))
+ elif k in ('desc', 'pgpsig'):
+ # do NOT prune these values at all
+ setattr(self, k, v[0])
elif k == 'version':
match = self.version_re.match(v[0])
self.ver = match.group(3)
self.rel = match.group(4)
if match.group(2):
self.epoch = int(match.group(2))
+ elif k == 'builddate':
+ try:
+ self.builddate = datetime.utcfromtimestamp(int(v[0]))
+ except ValueError:
+ try:
+ self.builddate = datetime.strptime(v[0],
+ '%a %b %d %H:%M:%S %Y')
+ except ValueError:
+ logger.warning('Package %s had unparsable build date %s',
+ self.name, v[0])
elif k == 'files':
- self.files = v
+ self.files = tuple(v)
self.has_files = True
else:
# anything left in collections
- setattr(self, k, v)
+ setattr(self, k, tuple(v))
@property
def full_version(self):
@@ -133,55 +135,6 @@ class Pkg(object):
return u'%s-%s' % (self.ver, self.rel)
-def find_user(userstring):
- '''
- Attempt to find the corresponding User object for a standard
- packager string, e.g. something like
- 'A. U. Thor <author@example.com>'.
- We start by searching for a matching email address; we then move onto
- matching by first/last name. If we cannot find a user, then return None.
- '''
- if userstring in find_user.cache:
- return find_user.cache[userstring]
- matches = re.match(r'^([^<]+)? ?<([^>]*)>', userstring)
- if not matches:
- return None
-
- user = None
- name = matches.group(1)
- email = matches.group(2)
-
- def user_email():
- return User.objects.get(email=email)
- def profile_email():
- return User.objects.get(userprofile__public_email=email)
- def user_name():
- # yes, a bit odd but this is the easiest way since we can't always be
- # sure how to split the name. Ensure every 'token' appears in at least
- # one of the two name fields.
- name_q = Q()
- for token in name.split():
- # ignore quoted parts; e.g. nicknames in strings
- if re.match(r'^[\'"].*[\'"]$', token):
- continue
- name_q &= (Q(first_name__icontains=token) |
- Q(last_name__icontains=token))
- return User.objects.get(name_q)
-
- for matcher in (user_email, profile_email, user_name):
- try:
- user = matcher()
- break
- except (User.DoesNotExist, User.MultipleObjectsReturned):
- pass
-
- find_user.cache[userstring] = user
- return user
-
-# cached mappings of user strings -> User objects so we don't have to do the
-# lookup more than strictly necessary.
-find_user.cache = {}
-
DEPEND_RE = re.compile(r"^(.+?)((>=|<=|=|>|<)(.*))?$")
def create_depend(package, dep_str, optional=False):
@@ -234,6 +187,8 @@ def create_multivalued(dbpkg, repopkg, db_attr, repo_attr):
for name in getattr(repopkg, repo_attr):
collection.create(name=name)
+finder = UserFinder()
+
def populate_pkg(dbpkg, repopkg, force=False, timestamp=None):
if repopkg.base:
dbpkg.pkgbase = repopkg.base
@@ -247,18 +202,11 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None):
dbpkg.filename = repopkg.filename
dbpkg.compressed_size = repopkg.csize
dbpkg.installed_size = repopkg.isize
- try:
- dbpkg.build_date = datetime.utcfromtimestamp(int(repopkg.builddate))
- except ValueError:
- try:
- dbpkg.build_date = datetime.strptime(repopkg.builddate,
- '%a %b %d %H:%M:%S %Y')
- except ValueError:
- logger.warning('Package %s had unparsable build date %s',
- repopkg.name, repopkg.builddate)
+ dbpkg.build_date = repopkg.builddate
dbpkg.packager_str = repopkg.packager
# attempt to find the corresponding django user for this string
- dbpkg.packager = find_user(repopkg.packager)
+ dbpkg.packager = finder.find(repopkg.packager)
+ dbpkg.pgp_signature = repopkg.pgpsig
if timestamp:
dbpkg.flag_date = None
@@ -287,10 +235,13 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None):
create_multivalued(dbpkg, repopkg, 'licenses', 'license')
+pkg_same_version = lambda pkg, dbpkg: pkg.ver == dbpkg.pkgver \
+ and pkg.rel == dbpkg.pkgrel and pkg.epoch == dbpkg.epoch
+
+
def populate_files(dbpkg, repopkg, force=False):
if not force:
- if dbpkg.pkgver != repopkg.ver or dbpkg.pkgrel != repopkg.rel \
- or dbpkg.epoch != repopkg.epoch:
+ if not pkg_same_version(repopkg, dbpkg):
logger.info("DB version (%s) didn't match repo version "
"(%s) for package %s, skipping file list addition",
dbpkg.full_version, repopkg.full_version, dbpkg.pkgname)
@@ -299,6 +250,7 @@ def populate_files(dbpkg, repopkg, force=False):
pass
elif dbpkg.files_last_update > dbpkg.last_update:
return
+
# only delete files if we are reading a DB that contains them
if repopkg.has_files:
dbpkg.packagefile_set.all().delete()
@@ -318,95 +270,155 @@ def populate_files(dbpkg, repopkg, force=False):
dbpkg.files_last_update = datetime.utcnow()
dbpkg.save()
-@transaction.commit_on_success
-def db_update(archname, reponame, pkgs, options):
- """
- Parses a list and updates the Arch dev database accordingly.
- Arguments:
- pkgs -- A list of Pkg objects.
+def select_pkg_for_update(dbpkg):
+ database = router.db_for_write(Package, instance=dbpkg)
+ connection = connections[database]
+ if 'sqlite' in connection.settings_dict['ENGINE'].lower():
+ return dbpkg
+ new_pkg = Package.objects.raw(
+ 'SELECT * FROM packages WHERE id = %s FOR UPDATE',
+ [dbpkg.id])
+ return list(new_pkg)[0]
+
+
+def update_common(archname, reponame, pkgs, sanity_check=True):
+ with transaction.commit_manually():
+ repository = Repo.objects.get(name__iexact=reponame)
+ architecture = Arch.objects.get(name__iexact=archname)
+ # no-arg order_by() removes even the default ordering; we don't need it
+ dbpkgs = Package.objects.filter(
+ arch=architecture, repo=repository).order_by()
+
+ logger.info("%d packages in current web DB", len(dbpkgs))
+ logger.info("%d packages in new updating DB", len(pkgs))
+
+ # Try to catch those random package deletions that make Eric so unhappy.
+ if len(dbpkgs):
+ dbpercent = 100.0 * len(pkgs) / len(dbpkgs)
+ else:
+ dbpercent = 0.0
+ logger.info("DB package ratio: %.1f%%", dbpercent)
+
+ # Fewer than 20 packages makes the percentage check unreliable, but it also
+ # means we expect the repo to fluctuate a lot.
+ msg = "Package database has %.1f%% the number of packages in the " \
+ "web database" % dbpercent
+ if not sanity_check:
+ pass
+ elif repository.testing or repository.staging:
+ pass
+ elif len(dbpkgs) == 0 and len(pkgs) == 0:
+ pass
+ elif len(dbpkgs) > 20 and dbpercent < 50.0:
+ logger.error(msg)
+ raise Exception(msg)
+ elif dbpercent < 75.0:
+ logger.warning(msg)
+
+ # If isolation level is repeatable-read, we need to ensure each package
+ # update starts a new transaction and re-queries the database as necessary
+ # to guard against simultaneous updates
+ transaction.commit()
+ return dbpkgs
+
+def db_update(archname, reponame, pkgs, force=False):
"""
- logger.info('Updating Arch: %s', archname)
- force = options.get('force', False)
- filesonly = options.get('filesonly', False)
+ Parses a list of packages and updates the packages database accordingly.
+ """
+ logger.info('Updating %s (%s)', reponame, archname)
+ dbpkgs = update_common(archname, reponame, pkgs, True)
repository = Repo.objects.get(name__iexact=reponame)
architecture = Arch.objects.get(name__iexact=archname)
- # no-arg order_by() removes even the default ordering; we don't need it
- dbpkgs = Package.objects.filter(
- arch=architecture, repo=repository).order_by()
+
# This makes our inner loop where we find packages by name *way* more
# efficient by not having to go to the database for each package to
# SELECT them by name.
- dbdict = dict([(pkg.pkgname, pkg) for pkg in dbpkgs])
+ dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs)
- logger.debug("Creating sets")
dbset = set(dbdict.keys())
syncset = set([pkg.name for pkg in pkgs])
- logger.info("%d packages in current web DB", len(dbset))
- logger.info("%d packages in new updating db", len(syncset))
+
in_sync_not_db = syncset - dbset
logger.info("%d packages in sync not db", len(in_sync_not_db))
-
- # Try to catch those random package deletions that make Eric so unhappy.
- if len(dbset):
- dbpercent = 100.0 * len(syncset) / len(dbset)
- else:
- dbpercent = 0.0
- logger.info("DB package ratio: %.1f%%", dbpercent)
-
- # Fewer than 20 packages makes the percentage check unreliable, but it also
- # means we expect the repo to fluctuate a lot.
- msg = "Package database has %.1f%% the number of packages in the " \
- "web database" % dbpercent
- if len(dbset) == 0 and len(syncset) == 0:
- pass
- elif not filesonly and \
- len(dbset) > 20 and dbpercent < 50.0 and \
- not repository.testing and not repository.staging:
- logger.error(msg)
- raise Exception(msg)
- elif dbpercent < 75.0:
- logger.warning(msg)
-
- if not filesonly:
- # packages in syncdb and not in database (add to database)
- for p in [x for x in pkgs if x.name in in_sync_not_db]:
- logger.info("Adding package %s", p.name)
- pkg = Package(pkgname = p.name, arch = architecture, repo = repository)
- populate_pkg(pkg, p, timestamp=datetime.utcnow())
-
- # packages in database and not in syncdb (remove from database)
- in_db_not_sync = dbset - syncset
- for p in in_db_not_sync:
- logger.info("Removing package %s from database", p)
- dbp = dbdict[p]
- dbp.delete()
+ # packages in syncdb and not in database (add to database)
+ for pkg in (pkg for pkg in pkgs if pkg.name in in_sync_not_db):
+ logger.info("Adding package %s", pkg.name)
+ dbpkg = Package(pkgname=pkg.name, arch=architecture, repo=repository)
+ try:
+ with transaction.commit_on_success():
+ populate_pkg(dbpkg, pkg, timestamp=datetime.utcnow())
+ except IntegrityError:
+ logger.warning("Could not add package %s; "
+ "not fatal if another thread beat us to it.",
+ pkg.name, exc_info=True)
+
+ # packages in database and not in syncdb (remove from database)
+ for pkgname in (dbset - syncset):
+ logger.info("Removing package %s", pkgname)
+ dbpkg = dbdict[pkgname]
+ with transaction.commit_on_success():
+ # no race condition here as long as simultaneous threads both
+ # issue deletes; second delete will be a no-op
+ dbpkg.delete()
# packages in both database and in syncdb (update in database)
pkg_in_both = syncset & dbset
- for p in [x for x in pkgs if x.name in pkg_in_both]:
- logger.debug("Looking for package updates")
- dbp = dbdict[p.name]
+ for pkg in (x for x in pkgs if x.name in pkg_in_both):
+ logger.debug("Checking package %s", pkg.name)
+ dbpkg = dbdict[pkg.name]
timestamp = None
# for a force, we don't want to update the timestamp.
# for a non-force, we don't want to do anything at all.
- if filesonly:
- pass
- elif p.ver == dbp.pkgver and p.rel == dbp.pkgrel \
- and p.epoch == dbp.epoch:
+ if pkg_same_version(pkg, dbpkg):
if not force:
continue
else:
timestamp = datetime.utcnow()
- if filesonly:
- logger.debug("Checking files for package %s in database", p.name)
- populate_files(dbp, p, force=force)
- else:
- logger.info("Updating package %s in database", p.name)
- populate_pkg(dbp, p, force=force, timestamp=timestamp)
- logger.info('Finished updating Arch: %s', archname)
+ # The odd select_for_update song and dance here are to ensure
+ # simultaneous updates don't happen on a package, causing
+ # files/depends/all related items to be double-imported.
+ with transaction.commit_on_success():
+ # TODO Django 1.4 select_for_update() will work once released
+ dbpkg = select_pkg_for_update(dbpkg)
+ if pkg_same_version(pkg, dbpkg):
+ logger.debug("Package %s was already updated", pkg.name)
+ continue
+ logger.info("Updating package %s", pkg.name)
+ populate_pkg(dbpkg, pkg, force=force, timestamp=timestamp)
+
+ logger.info('Finished updating arch: %s', archname)
+
+
+def filesonly_update(archname, reponame, pkgs, force=False):
+ """
+ Parses a list of packages and updates only the stored file lists of packages already in the database.
+ """
+ logger.info('Updating files for %s (%s)', reponame, archname)
+ dbpkgs = update_common(archname, reponame, pkgs, False)
+ dbdict = dict((dbpkg.pkgname, dbpkg) for dbpkg in dbpkgs)
+ dbset = set(dbdict.keys())
+
+ for pkg in (pkg for pkg in pkgs if pkg.name in dbset):
+ dbpkg = dbdict[pkg.name]
+
+ # The odd select_for_update song and dance here are to ensure
+ # simultaneous updates don't happen on a package, causing
+ # files to be double-imported.
+ with transaction.commit_on_success():
+ if not dbpkg.files_last_update or not dbpkg.last_update:
+ pass
+ elif dbpkg.files_last_update > dbpkg.last_update:
+ logger.debug("Files for %s are up to date", pkg.name)
+ continue
+ # TODO Django 1.4 select_for_update() will work once released
+ dbpkg = select_pkg_for_update(dbpkg)
+ logger.debug("Checking files for package %s", pkg.name)
+ populate_files(dbpkg, pkg, force=force)
+
+ logger.info('Finished updating arch: %s', archname)
def parse_info(iofile):
@@ -421,7 +433,7 @@ def parse_info(iofile):
continue
elif line.startswith('%') and line.endswith('%'):
blockname = line[1:-1].lower()
- logger.debug("Parsing package block %s", blockname)
+ logger.log(TRACE, "Parsing package block %s", blockname)
store[blockname] = []
elif blockname:
store[blockname].append(line)
@@ -454,60 +466,73 @@ def parse_repo(repopath):
repodb = tarfile.open(repopath, "r")
logger.debug("Starting package parsing")
dbfiles = ('desc', 'depends', 'files')
- pkgs = {}
+ newpkg = lambda: Pkg(reponame)
+ pkgs = defaultdict(newpkg)
for tarinfo in repodb.getmembers():
if tarinfo.isreg():
pkgid, fname = os.path.split(tarinfo.name)
if fname not in dbfiles:
continue
data_file = repodb.extractfile(tarinfo)
- if io is None:
- data_file = codecs.EncodedFile(data_file, 'utf-8')
- else:
- data_file = io.TextIOWrapper(io.BytesIO(data_file.read()),
- encoding='utf=8')
+ data_file = io.TextIOWrapper(io.BytesIO(data_file.read()),
+ encoding='UTF-8')
try:
- data = parse_info(data_file)
- p = pkgs.setdefault(pkgid, Pkg(reponame))
- p.populate(data)
- except UnicodeDecodeError, e:
+ pkgs[pkgid].populate(parse_info(data_file))
+ except UnicodeDecodeError:
logger.warn("Could not correctly decode %s, skipping file",
tarinfo.name)
data_file.close()
+ del data_file
- logger.debug("Done parsing file %s", fname)
+ logger.debug("Done parsing file %s/%s", pkgid, fname)
repodb.close()
logger.info("Finished repo parsing, %d total packages", len(pkgs))
return (reponame, pkgs.values())
-def validate_arch(archname):
+def locate_arch(arch):
"Check if arch is valid."
- return Arch.objects.filter(name__iexact=archname).exists()
+ if isinstance(arch, Arch):
+ return arch
+ try:
+ return Arch.objects.get(name__iexact=arch)
+ except Arch.DoesNotExist:
+ raise CommandError(
+ 'Specified architecture %s is not currently known.' % arch)
+
def read_repo(primary_arch, repo_file, options):
"""
Parses repo.db.tar.gz file and returns exit status.
"""
+ # always returns an Arch object, regardless of what is passed in
+ primary_arch = locate_arch(primary_arch)
+ force = options.get('force', False)
+ filesonly = options.get('filesonly', False)
+
repo, packages = parse_repo(repo_file)
# group packages by arch -- to handle noarch stuff
packages_arches = {}
for arch in Arch.objects.filter(agnostic=True):
packages_arches[arch.name] = []
- packages_arches[primary_arch] = []
+ packages_arches[primary_arch.name] = []
for package in packages:
if package.arch in packages_arches:
packages_arches[package.arch].append(package)
else:
# we don't include mis-arched packages
- logger.warning("Package %s arch = %s",
- package.name,package.arch)
- logger.info('Starting database updates.')
+ logger.warning("Package %s arch = %s", package.name, package.arch)
+ del packages
+
+ logger.info('Starting database updates for %s.', repo_file)
for arch in sorted(packages_arches.keys()):
- db_update(arch, repo, packages_arches[arch], options)
- logger.info('Finished database updates.')
+ if filesonly:
+ filesonly_update(arch, repo, packages_arches[arch], force)
+ else:
+ db_update(arch, repo, packages_arches[arch], force)
+ logger.info('Finished database updates for %s.', repo_file)
return 0
# vim: set ts=4 sw=4 et: