# -*- coding: utf-8 -*-
#
# Provenance (recovered from the patch this module was extracted from):
#   Commit:  c1697ab694fe549d7b6ff81a00737a2ad63e9461
#   Author:  Dan McGee
#   Date:    Wed, 10 Feb 2010 21:28:49 -0600
#   Subject: reporead: turn into a django-admin command
#
#   Rather than struggle with getting the environment set up, let's make this
#   a custom Django admin command and use the flexibility that gives us. This
#   is the initial rough cut of making it happen; further commits should clean
#   up some of the rough edges.
#
#   Signed-off-by: Dan McGee
#
# The same commit also created the empty package markers
# devel/management/__init__.py and devel/management/commands/__init__.py;
# this file is devel/management/commands/reporead.py.
"""
reporead command

Parses a repo.db.tar.gz file and updates the Arch database with the relevant
changes.

Usage: ./manage.py reporead ARCH PATH
 ARCH:  architecture to update, and can be one of: i686, x86_64
 PATH:  full path to the repo.db.tar.gz file.

Example:
  ./manage.py reporead i686 /tmp/core.db.tar.gz
"""

from django.core.management.base import BaseCommand
from django.conf import settings
from django.db import models, transaction
from django.core import management

import os
import re
import sys
import tarfile
import logging
from datetime import datetime
from optparse import make_option

from cStringIO import StringIO
from logging import WARNING, INFO, DEBUG

from main.models import Arch, Package, Repo

# Names of the %BLOCK% sections recognised in a repo db entry. Every block is
# parsed as a list of values (one value per line).
REPOVARS = ['arch', 'backup', 'base', 'builddate', 'conflicts', 'csize',
            'deltas', 'depends', 'desc', 'filename', 'files', 'force',
            'groups', 'installdate', 'isize', 'license', 'md5sum',
            'name', 'optdepends', 'packager', 'provides', 'reason',
            'replaces', 'size', 'url', 'version']


class SomethingFishyException(Exception):
    '''Raised when the database looks like it's going to wipe out a bunch of
    packages.'''
    pass


# NOTE: configuring the root logger is an import-time side effect of this
# module; it is kept because the command relies on it for its output.
logging.basicConfig(
    level=WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()


class Command(BaseCommand):
    """Django admin command: ``./manage.py reporead ARCH PATH``."""

    option_list = BaseCommand.option_list

    def handle(self, arch=None, file=None, **options):
        """Entry point called by Django with the positional arguments.

        Prints the usage text and returns 0 when either argument is missing;
        otherwise normalizes the path and hands over to read_repo().
        """
        logger.setLevel(INFO)
        if arch is None or file is None:
            usage()
            return 0
        file = os.path.normpath(file)
        read_repo(arch, file)


class Pkg(object):
    """An interim 'container' object for holding Arch package data.

    Built from the dict returned by parse_inf(): every parsed block becomes
    an instance attribute. Attributes that were not present in the parsed
    data read as None ('force' reads as False), see __getattr__.
    """

    def __init__(self, val):
        """Populate the instance from ``val``.

        Arguments:
         val -- dict mapping block name to a list of string values; must
                contain 'name'. The dict itself is not modified.
        """
        # blocks whose value list is collapsed into one comma separated string
        squash = ('arch', 'builddate', 'csize', 'desc', 'filename',
                  'installdate', 'isize', 'license', 'md5sum',
                  'packager', 'size', 'url')

        # work on a shallow copy so the caller's dict stays untouched
        val = dict(val)
        name = val.pop('name')[0]
        attrs = {'name': name, 'base': None}

        if 'desc' not in val:
            logger.warning("Package %s has no description", name)
            val['desc'] = None
        val.setdefault('url', None)
        val.setdefault('license', [])

        for key, value in val.items():
            if key in squash:
                if not value:
                    logger.warning("Package %s has no %s", name, key)
                    attrs[key] = None
                else:
                    # make sure we don't have elements larger than the db
                    # char fields
                    attrs[key] = ', '.join(value)[:255]
            elif key == 'base':
                attrs[key] = value[0]
            elif key == 'force':
                attrs[key] = True
            elif key == 'version':
                # "pkgver-pkgrel": split on the LAST hyphen only
                attrs['ver'], attrs['rel'] = value[0].rsplit('-', 1)
            elif key == 'reason':
                attrs[key] = int(value[0])
            else:
                attrs[key] = value
        self.__dict__ = attrs

    def __getattr__(self, name):
        """Fallback for attributes that were not set from the parsed data."""
        # Let protocol lookups (copy, pickle, ...) fail normally instead of
        # handing back a non-callable None.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        if name == 'force':
            return False
        return None


def usage():
    """Print the usage of this application."""
    print(__doc__.strip())


def populate_pkg(dbpkg, repopkg, timestamp=None):
    """Copy the data of a parsed package onto a database package and save it.

    Arguments:
     dbpkg -- the Package model instance to update (it is saved here).
     repopkg -- the Pkg object holding the data parsed from the repo db.
     timestamp -- value stored as last_update; defaults to datetime.now().

    The existing dependency rows of dbpkg are deleted and recreated from
    repopkg.depends.
    """
    if timestamp is None:
        timestamp = datetime.now()
    dbpkg.pkgbase = repopkg.base
    dbpkg.pkgver = repopkg.ver
    dbpkg.pkgrel = repopkg.rel
    dbpkg.pkgdesc = repopkg.desc
    dbpkg.license = repopkg.license
    dbpkg.url = repopkg.url
    dbpkg.needupdate = False
    dbpkg.last_update = timestamp
    dbpkg.save()
    # files are not in the repo.db.tar.gz
    #for x in repopkg.files:
    #    dbpkg.packagefile_set.create(path=x)
    dbpkg.packagedepend_set.all().delete()
    if 'depends' in repopkg.__dict__:
        for y in repopkg.depends:
            # split "name>=1.0" into the name and the version comparison
            # NOTE(review): a dependency not starting with [a-z0-9._+-]
            # makes re.match return None and this line raise -- confirm
            # such names cannot occur.
            dpname, dpvcmp = re.match(r"([a-z0-9._+-]+)(.*)", y).groups()
            # make sure we aren't adding self depends..
            # yes *sigh* i have seen them in pkgbuilds
            if dpname == repopkg.name:
                logger.warning('Package %s has a depend on itself',
                               repopkg.name)
                continue
            dbpkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
            logger.debug('Added %s as dep for pkg %s', dpname, repopkg.name)


def db_update(archname, pkgs):
    """
    Parses a list and updates the Arch dev database accordingly.

    Arguments:
     archname -- Name of the architecture the packages belong to.
     pkgs -- A non-empty list of Pkg objects, all from the same repo (the
             repo is taken from the first element).

    Raises SomethingFishyException when the new package list is less than
    half the size of what the database holds for a non-testing repo.
    """
    logger.info('Updating Arch: %s', archname)
    repository = Repo.objects.get(name__iexact=pkgs[0].repo)
    architecture = Arch.objects.get(name__iexact=archname)
    dbpkgs = Package.objects.filter(arch=architecture, repo=repository)
    # Evaluate the query once and index the rows by name. This makes the
    # loops below *way* more efficient by not having to go to the database
    # for each package to SELECT them by name.
    dbdict = dict((pkg.pkgname, pkg) for pkg in dbpkgs)
    now = datetime.now()

    # go go set theory!
    # thank you python for having a set class <3
    logger.debug("Creating sets")
    dbset = set(dbdict)
    syncset = set(pkg.name for pkg in pkgs)
    logger.info("%d packages in current web DB", len(dbset))
    logger.info("%d packages in new updating db", len(syncset))
    # packages in syncdb and not in database (add to database)
    logger.debug("Set theory: Packages in syncdb not in database")
    in_sync_not_db = syncset - dbset
    logger.info("%d packages in sync not db", len(in_sync_not_db))

    # Try to catch those random orphaning issues that make Eric so unhappy.
    if len(dbset) > 20:
        dbpercent = 100.0 * len(syncset) / len(dbset)
    else:
        # we don't have 20 packages in this repo/arch, so this check could
        # produce a lot of false positives (or a div by zero). fake it
        dbpercent = 100.0
    logger.info("DB package ratio: %.1f%%", dbpercent)
    if dbpercent < 50.0 and 'testing' not in repository.name.lower():
        logger.error(
            ".db.tar.gz has %.1f%% the number of packages in the web database",
            dbpercent)
        raise SomethingFishyException(
            'It looks like the syncdb is less than half the size of the web db. WTF?')

    if dbpercent < 75.0:
        logger.warning(
            ".db.tar.gz has %.1f%% the number of packages in the web database.",
            dbpercent)

    for p in pkgs:
        if p.name not in in_sync_not_db:
            continue
        logger.info("Adding package %s", p.name)
        pkg = Package(pkgname=p.name, arch=architecture, repo=repository)
        populate_pkg(pkg, p, timestamp=now)

    # packages in database and not in syncdb (remove from database)
    logger.debug("Set theory: Packages in database not in syncdb")
    in_db_not_sync = dbset - syncset
    for name in in_db_not_sync:
        logger.info("Removing package %s from database", name)
        # the row was already loaded above, no need to SELECT it again
        dbdict[name].delete()

    # packages in both database and in syncdb (update in database)
    logger.debug("Set theory: Packages in database and syncdb")
    pkg_in_both = syncset & dbset
    for p in pkgs:
        if p.name not in pkg_in_both:
            continue
        logger.debug("Looking for package updates")
        dbp = dbdict[p.name]
        # Compare as a pair: joining ver and rel into one string would make
        # e.g. ("1.1", "12") and ("1.11", "2") look identical.
        if (p.ver, p.rel) == (dbp.pkgver, dbp.pkgrel):
            continue
        logger.info("Updating package %s in database", p.name)
        populate_pkg(dbp, p, timestamp=now)

    logger.info('Finished updating Arch: %s', archname)


def parse_inf(iofile):
    """
    Parses an Arch repo db information file, and returns variables as a dict.

    A block starts with a ``%NAME%`` line whose name is listed in REPOVARS;
    the following lines up to the next blank line are its values. Lines
    outside of a recognised block are ignored.

    Arguments:
     iofile -- A StringIO, FileType, or other object with readlines method.

    Returns a dict mapping the lowercased block name to its list of values.
    """
    store = {}
    blockname = None
    for raw in iofile.readlines():
        line = raw.strip()
        if blockname is not None:
            if line:
                store[blockname].append(line)
            else:
                # a blank line terminates the current block
                blockname = None
            continue
        if line.startswith('%') and line.endswith('%'):
            candidate = line[1:-1].lower()
            if candidate in REPOVARS:
                blockname = candidate
                logger.debug("Parsing package block %s", blockname)
                store[blockname] = []
    return store


def parse_repo(repopath):
    """
    Parses an Arch repo db file, and returns a list of Pkg objects.

    Arguments:
     repopath -- The path of a repository db file; the file name must end in
                 '.db.tar.gz', the part before that is used as repo name.

    Raises IOError when the file does not exist and ValueError when the file
    name does not contain '.db.tar.gz'.
    """
    logger.info("Starting repo parsing")
    if not os.path.exists(repopath):
        logger.error("Could not read file %s", repopath)
        raise IOError("Could not read file %s" % repopath)

    logger.info("Reading repo tarfile %s", repopath)
    filename = os.path.split(repopath)[1]
    reponame = filename[:filename.rindex('.db.tar.gz')]

    repodb = tarfile.open(repopath, "r:gz")
    ## assuming well formed tar, with dir first then files after
    ## repo-add enforces this
    logger.debug("Starting package parsing")
    pkgs = []
    tpkg = None
    try:
        while True:
            tarinfo = repodb.next()
            if tarinfo is None or tarinfo.isdir():
                # a new directory (or the end of the archive) completes the
                # package collected so far
                if tpkg is not None:
                    tpkg.seek(0)
                    p = Pkg(parse_inf(tpkg))
                    p.repo = reponame
                    logger.debug("Done parsing package %s", p.name)
                    pkgs.append(p)
                if tarinfo is None:
                    break
                # set new tpkg
                tpkg = StringIO()
            elif tarinfo.isreg():
                if os.path.split(tarinfo.name)[1] in ('desc', 'depends'):
                    tpkg.write(repodb.extractfile(tarinfo).read())
                    tpkg.write('\n')  # just in case
    finally:
        # release the archive even when parsing a package fails
        repodb.close()
    logger.info("Finished repo parsing")
    return pkgs


@transaction.commit_on_success
def read_repo(arch, file):
    """
    Parses repo.db.tar.gz file and returns exit status.

    Arguments:
     arch -- Name of the architecture to update; when it is not the name of
             an Arch row the usage text is printed and nothing is done.
     file -- Path of the repo db file.

    Always returns 0.
    """
    # check if arch is valid
    available_arches = [x.name for x in Arch.objects.all()]
    if arch not in available_arches:
        usage()
        return 0
    primary_arch = arch

    packages = parse_repo(file)

    # sort packages by arch -- to handle noarch stuff
    packages_arches = dict((name, []) for name in available_arches)

    for package in packages:
        if package.arch in ('any', primary_arch):
            packages_arches[package.arch].append(package)
        else:
            logger.warning("Package %s arch = %s", package.name, package.arch)
            #package.arch = primary_arch

    logger.info('Starting database updates.')
    for archname, pkgs in packages_arches.items():
        if len(pkgs) > 0:
            db_update(archname, pkgs)
    logger.info('Finished database updates.')
    return 0

# vim: set ts=4 sw=4 et: