# -*- coding: utf-8 -*-
"""
reporead command

Parses a repo.db.tar.gz file and updates the Arch database with the relevant
changes.

Usage: ./manage.py reporead ARCH PATH
 ARCH:  architecture to update; must be available in the database
 PATH:  full path to the repo.db.tar.gz file.

Example:
  ./manage.py reporead i686 /tmp/core.db.tar.gz
"""

# multi value blocks
REPOVARS = ['arch', 'backup', 'base', 'builddate', 'conflicts', 'csize',
            'deltas', 'depends', 'desc', 'filename', 'files', 'force',
            'groups', 'installdate', 'isize', 'license', 'md5sum',
            'name', 'optdepends', 'packager', 'provides', 'reason',
            'replaces', 'size', 'url', 'version']


from django.core.management.base import BaseCommand, CommandError
from django.contrib.auth.models import User
from django.db import transaction
from django.db.models import Q

import os
import re
import sys
import tarfile
import logging
from datetime import datetime
from optparse import make_option

from cStringIO import StringIO
from logging import ERROR, WARNING, INFO, DEBUG

from main.models import Arch, Package, Repo

class SomethingFishyException(Exception):
    '''Raised when the database looks like its going to wipe out a bunch of
    packages.'''
    pass

logging.basicConfig(
    level=WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()

class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
        make_option('-f', '--force', action='store_true', dest='force', default=False,
            help='Force a re-import of data for all packages instead of only new ones. Will not touch the \'last updated\' value.'),
        make_option('--filesonly', action='store_true', dest='filesonly', default=False,
            help='Load filelists if they are outdated, but will not add or remove any packages. Will not touch the \'last updated\' value.'),
    )
    help = "Runs a package repository import for the given arch and file."
    args = "<arch> <filename>"

    def handle(self, arch=None, filename=None, **options):
        if not arch:
            raise CommandError('Architecture is required.')
        if not validate_arch(arch):
            raise CommandError('Specified architecture %s is not currently known.' % arch)
        if not filename:
            raise CommandError('Package database file is required.')
        filename = os.path.normpath(filename)
        if not os.path.exists(filename) or not os.path.isfile(filename):
            raise CommandError('Specified package database file does not exist.')

        v = int(options.get('verbosity', 0))
        if v == 0:
            logger.level = ERROR
        elif v == 1:
            logger.level = INFO
        elif v == 2:
            logger.level = DEBUG

        import signal, traceback
        signal.signal(signal.SIGQUIT,
                lambda sig, stack: traceback.print_stack(stack))

        return read_repo(arch, filename, options)


class Pkg(object):
    """An interim 'container' object for holding Arch package data."""

    def __init__(self, val, repo):
        selfdict = {}
        squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
                  'installdate', 'isize', 'license', 'md5sum',
                  'packager', 'size', 'url']

        selfdict['name'] = val['name'][0]
        selfdict['base'] = None
        del val['name']
        if 'license' not in val:
            val['license'] = []
        for x in val.keys():
            if x in squash:
                if val[x] == None or len(val[x]) == 0:
                    logger.warning("Package %s has no %s" % (selfdict['name'], x))
                    selfdict[x] = None
                else:
                    selfdict[x] = ', '.join(val[x])
                    # make sure we don't have elements larger than the db char
                    # fields
                    if len(selfdict[x]) > 255:
                        selfdict[x] = selfdict[x][:254]
            elif x == 'base':
                selfdict[x] = val[x][0]
            elif x == 'force':
                selfdict[x] = True
            elif x == 'version':
                version = val[x][0].rsplit('-')
                selfdict['ver'] = version[0]
                selfdict['rel'] = version[1]
            elif x == 'reason':
                selfdict[x] = int(val[x][0])
            else:
                selfdict[x] = val[x]
        self.__dict__ = selfdict
        self.repo = repo

    def __getattr__(self, name):
        if name == 'force':
            return False
        else:
            return None


def find_user(userstring):
    '''
    Attempt to find the corresponding User object for a standard
    packager string, e.g. something like
        'A. U. Thor <author@example.com>'.
    We start by searching for a matching email address; we then move onto
    matching by first/last name. If we cannot find a user, then return None.
    '''
    if userstring in find_user.cache:
        return find_user.cache[userstring]
    matches = re.match(r'^([^<]+)? ?<([^>]*)>', userstring)
    if not matches:
        return None

    user = None
    name = matches.group(1)
    email = matches.group(2)

    def user_email():
        return User.objects.get(email=email)
    def profile_email():
        return User.objects.get(userprofile_user__public_email=email)
    def user_name():
        # yes, a bit odd but this is the easiest way since we can't always be
        # sure how to split the name. Ensure every 'token' appears in at least
        # one of the two name fields.
        name_q = Q()
        for token in name.split():
            name_q &= (Q(first_name__icontains=token) |
                    Q(last_name__icontains=token))
        return User.objects.get(name_q)

    for matcher in (user_email, profile_email, user_name):
        try:
            user = matcher()
            break
        except (User.DoesNotExist, User.MultipleObjectsReturned):
            pass

    find_user.cache[userstring] = user
    return user

# cached mappings of user strings -> User objects so we don't have to do the
# lookup more than strictly necessary.
find_user.cache = {}

def populate_pkg(dbpkg, repopkg, force=False, timestamp=None):
    if repopkg.base:
        dbpkg.pkgbase = repopkg.base
    else:
        dbpkg.pkgbase = repopkg.name
    dbpkg.pkgver = repopkg.ver
    dbpkg.pkgrel = repopkg.rel
    dbpkg.pkgdesc = repopkg.desc
    dbpkg.license = repopkg.license
    dbpkg.url = repopkg.url
    dbpkg.filename = repopkg.filename
    dbpkg.compressed_size = int(repopkg.csize)
    dbpkg.installed_size = int(repopkg.isize)
    try:
        dbpkg.build_date = datetime.utcfromtimestamp(int(repopkg.builddate))
    except ValueError:
        try:
            dbpkg.build_date = datetime.strptime(repopkg.builddate,
                    '%a %b %d %H:%M:%S %Y')
        except ValueError:
            logger.warning('Package %s had unparsable build date %s' % \
                    (repopkg.name, repopkg.builddate))
    dbpkg.packager_str = repopkg.packager
    # attempt to find the corresponding django user for this string
    dbpkg.packager = find_user(repopkg.packager)

    if timestamp:
        dbpkg.flag_date = None
        dbpkg.last_update = timestamp
    dbpkg.save()

    populate_files(dbpkg, repopkg, force=force)

    dbpkg.packagedepend_set.all().delete()
    if 'depends' in repopkg.__dict__:
        for y in repopkg.depends:
            # make sure we aren't adding self depends..
            # yes *sigh* i have seen them in pkgbuilds
            dpname, dpvcmp = re.match(r"([a-z0-9._+-]+)(.*)", y).groups()
            if dpname == repopkg.name:
                logger.warning('Package %s has a depend on itself' % repopkg.name)
                continue
            dbpkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
            logger.debug('Added %s as dep for pkg %s' % (dpname, repopkg.name))

    dbpkg.packagegroup_set.all().delete()
    if 'groups' in repopkg.__dict__:
        for y in repopkg.groups:
            dbpkg.packagegroup_set.create(name=y)


def populate_files(dbpkg, repopkg, force=False):
    if not force:
        if not dbpkg.files_last_update or not dbpkg.last_update:
            pass
        elif dbpkg.files_last_update > dbpkg.last_update:
            return
    # only delete files if we are reading a DB that contains them
    if 'files' in repopkg.__dict__:
        dbpkg.packagefile_set.all().delete()
        logger.info("adding %d files for package %s" % (len(repopkg.files), dbpkg.pkgname))
        for x in repopkg.files:
            dbpkg.packagefile_set.create(path=x)
        dbpkg.files_last_update = datetime.now()
        dbpkg.save()

def db_update(archname, reponame, pkgs, options):
    """
    Parses a list and updates the Arch dev database accordingly.

    Arguments:
      pkgs -- A list of Pkg objects.

    """
    logger.info('Updating Arch: %s' % archname)
    force = options.get('force', False)
    filesonly = options.get('filesonly', False)
    repository = Repo.objects.get(name__iexact=reponame)
    architecture = Arch.objects.get(name__iexact=archname)
    dbpkgs = Package.objects.filter(arch=architecture, repo=repository)
    # It makes sense to fully evaluate our DB query now because we will
    # be using 99% of the objects in our "in both sets" loop. Force eval
    # by calling list() on the QuerySet.
    list(dbpkgs)
    # This makes our inner loop where we find packages by name *way* more
    # efficient by not having to go to the database for each package to
    # SELECT them by name.
    dbdict = dict([(pkg.pkgname, pkg) for pkg in dbpkgs])

    # go go set theory!
    # thank you python for having a set class <3
    logger.debug("Creating sets")
    dbset = set([pkg.pkgname for pkg in dbpkgs])
    syncset = set([pkg.name for pkg in pkgs])
    logger.info("%d packages in current web DB" % len(dbset))
    logger.info("%d packages in new updating db" % len(syncset))
    # packages in syncdb and not in database (add to database)
    logger.debug("Set theory: Packages in syncdb not in database")
    in_sync_not_db = syncset - dbset
    logger.info("%d packages in sync not db" % len(in_sync_not_db))

    # Try to catch those random orphaning issues that make Eric so unhappy.
    if len(dbset) > 20:
        dbpercent = 100.0 * len(syncset) / len(dbset)
    else:
        # we don't have 20 packages in this repo/arch, so this check could
        # produce a lot of false positives (or a div by zero). fake it
        dbpercent = 100.0
    logger.info("DB package ratio: %.1f%%" % dbpercent)
    if dbpercent < 50.0 and not repository.testing:
        logger.error(".db.tar.gz has %.1f%% the number of packages in the web database" % dbpercent)
        raise SomethingFishyException(
            'It looks like the syncdb is less than half the size of the web db. WTF?')

    if dbpercent < 75.0:
        logger.warning(".db.tar.gz has %.1f%% the number of packages in the web database." % dbpercent)

    if not filesonly:
        # packages in syncdb and not in database (add to database)
        logger.debug("Set theory: Packages in syncdb not in database")
        for p in [x for x in pkgs if x.name in in_sync_not_db]:
            logger.info("Adding package %s", p.name)
            pkg = Package(pkgname = p.name, arch = architecture, repo = repository)
            populate_pkg(pkg, p, timestamp=datetime.now())

        # packages in database and not in syncdb (remove from database)
        logger.debug("Set theory: Packages in database not in syncdb")
        in_db_not_sync = dbset - syncset
        for p in in_db_not_sync:
            logger.info("Removing package %s from database", p)
            Package.objects.get(
                pkgname=p, arch=architecture, repo=repository).delete()

    # packages in both database and in syncdb (update in database)
    logger.debug("Set theory: Packages in database and syncdb")
    pkg_in_both = syncset & dbset
    for p in [x for x in pkgs if x.name in pkg_in_both]:
        logger.debug("Looking for package updates")
        dbp = dbdict[p.name]
        timestamp = None
        # for a force, we don't want to update the timestamp.
        # for a non-force, we don't want to do anything at all.
        if filesonly:
            pass
        elif '-'.join((p.ver, p.rel)) == '-'.join((dbp.pkgver, dbp.pkgrel)):
            if not force:
                continue
        else:
            timestamp = datetime.now()
        if filesonly:
            logger.debug("Checking files for package %s in database", p.name)
            populate_files(dbp, p)
        else:
            logger.info("Updating package %s in database", p.name)
            populate_pkg(dbp, p, force=force, timestamp=timestamp)

    logger.info('Finished updating Arch: %s' % archname)


def parse_inf(iofile):
    """
    Parses an Arch repo db information file, and returns variables as a list.

    Arguments:
     iofile -- A StringIO, FileType, or other object with readlines method.

    """
    store = {}
    lines = iofile.readlines()
    blockname = None
    max_len = len(lines)
    i = 0
    while i < max_len:
        line = lines[i].strip()
        if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
            blockname = line[1:-1].lower()
            logger.debug("Parsing package block %s", blockname)
            store[blockname] = []
            i += 1
            while i < max_len and len(lines[i].strip()) > 0:
                store[blockname].append(lines[i].strip())
                i += 1
            # here is where i would convert arrays to strings
            # based on count and type, but i dont think it is needed now
        i += 1

    return store


def parse_repo(repopath):
    """
    Parses an Arch repo db file, and returns a list of Pkg objects.

    Arguments:
     repopath -- The path of a repository db file.

    """
    logger.info("Starting repo parsing")
    if not os.path.exists(repopath):
        logger.error("Could not read file %s", repopath)

    logger.info("Reading repo tarfile %s", repopath)
    filename = os.path.split(repopath)[1]
    m = re.match(r"^(.*)\.(db|files)\.tar\.(.*)$", filename)
    if m:
        reponame = m.group(1)
    else:
        logger.error("File does not have the proper extension")
        raise SomethingFishyException("File does not have the proper extension")

    repodb = tarfile.open(repopath,"r:gz")
    ## assuming well formed tar, with dir first then files after
    ## repo-add enforces this
    logger.debug("Starting package parsing")
    dbfiles = ('desc', 'depends', 'files')
    pkgs = []
    tpkg = None
    while True:
        tarinfo = repodb.next()
        if tarinfo == None or tarinfo.isdir():
            if tpkg != None:
                tpkg.reset()
                data = parse_inf(tpkg)
                p = Pkg(data, reponame)
                logger.debug("Done parsing package %s", p.name)
                pkgs.append(p)
            if tarinfo == None:
                break
            # set new tpkg
            tpkg = StringIO()
        if tarinfo.isreg():
            fname = os.path.split(tarinfo.name)[1]
            if fname in dbfiles:
                tpkg.write(repodb.extractfile(tarinfo).read())
                tpkg.write('\n') # just in case
    repodb.close()
    logger.info("Finished repo parsing")
    return (reponame, pkgs)

def validate_arch(arch):
    "Check if arch is valid."
    available_arches = [x.name for x in Arch.objects.all()]
    return arch in available_arches

@transaction.commit_on_success
def read_repo(primary_arch, file, options):
    """
    Parses repo.db.tar.gz file and returns exit status.
    """
    repo, packages = parse_repo(file)

    # sort packages by arch -- to handle noarch stuff
    packages_arches = {}
    packages_arches['any'] = []
    packages_arches[primary_arch] = []

    for package in packages:
        if package.arch in ('any', primary_arch):
            packages_arches[package.arch].append(package)
        else:
            # we don't include mis-arched packages
            logger.warning("Package %s arch = %s" % (
                package.name,package.arch))
    logger.info('Starting database updates.')
    for (arch, pkgs) in packages_arches.items():
        db_update(arch, repo, pkgs, options)
    logger.info('Finished database updates.')
    return 0

# vim: set ts=4 sw=4 et: