#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
reporead.py

Parses a repo.db.tar.gz file and updates the Arch database with the relevant
changes.

Usage: reporead.py ARCH PATH
 ARCH:  architecture to update; one of: i686, x86_64
 PATH:  full path to the repo.db.tar.gz file.

Example:
  reporead.py i686 /tmp/core.db.tar.gz

"""

###
### User Variables
###

# multi value blocks
REPOVARS = ['arch', 'backup', 'builddate', 'conflicts', 'csize', 
            'deltas', 'depends', 'desc', 'filename', 'files', 'force', 
            'groups', 'installdate', 'isize', 'license', 'md5sum', 
            'name', 'optdepends', 'packager', 'provides', 'reason', 
            'replaces', 'size', 'url', 'version']

###
### Imports
###

import os
import re
import sys
import tarfile
import logging
from datetime import datetime
from django.core.management import setup_environ
# mung the sys path to get to django root dir, no matter
# where we are called from
archweb_app_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
os.chdir(archweb_app_path)
sys.path[0] = archweb_app_path
import settings
setup_environ(settings)
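# with the Django environment configured, the model imports below will work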
from cStringIO import StringIO
from logging import WARNING, INFO, DEBUG
from main.models import Arch, Package, Repo

class SomethingFishyException(Exception):
    '''Raised when the database looks like it's going to wipe out a bunch of
    packages.'''
    pass

###
### Initialization
###

logging.basicConfig(
    level=WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()


###
### function and class definitions
###

class Pkg(object):
    """An interim 'container' object for holding Arch package data."""

    def __init__(self, val):
        selfdict = {}
        squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
                  'installdate', 'isize', 'license', 'md5sum', 
                  'packager', 'size', 'url']
        
        selfdict['name'] = val['name'][0]
        del val['name']
        if 'desc' not in val:
            logger.warning("Package %s has no description" % selfdict['name'])
            val['desc'] = ''
        if 'url' not in val:
            val['url'] = ''
        if 'license' not in val:
            val['license'] = []
        for x in val.keys():
            if x in squash:
                if len(val[x]) == 0:
                    logger.warning("Package %s has no %s" % (selfdict['name'],x))
                selfdict[x] = ''.join(val[x])
                # make sure we don't have elements larger than the db char
                # fields
                if len(selfdict[x]) > 255:
                    selfdict[x] = selfdict[x][:255]
            elif x == 'force':
                selfdict[x] = True
            elif x == 'version':
                # split version-release on the last hyphen only
                version = val[x][0].rsplit('-', 1)
                selfdict['ver'] = version[0]
                selfdict['rel'] = version[1]
            elif x == 'reason':
                selfdict[x] = int(val[x][0])
            else:
                selfdict[x] = val[x]
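        # expose all parsed fields directly as attributes on this object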
        self.__dict__ = selfdict
    
    def __getattr__(self, name):
        # supply defaults for attributes that were never parsed: 'force'
        # defaults to False, everything else to None
        if name == 'force':
            return False
        else:
            return None


def usage():
    """Print the usage of this application."""
    print __doc__.strip()


def fetchiter_dict(cursor):
    """
    Given a DB API 2.0 cursor object that has been executed, returns a 
    dictionary that maps each field to a column index
    """
    rows = cursor.fetchmany(size=30)
    while rows:
        for row in rows:
            yield dictize(cursor,row)
        rows = cursor.fetchmany(size=30)


def fetchone_dict(cursor):
    """
    Given a DB API 2.0 cursor object that has been executed, returns a 
    dictionary that maps each field to a column index
    """
    results = {}
    row = cursor.fetchone()
    return dictize(cursor,row)


def dictize(cursor, row):
    """Map the column names from cursor.description onto the values of a row."""
    result = {}
    for column, desc in enumerate(cursor.description):
        result[desc[0]] = row[column]
    return result


def populate_pkg(dbpkg, repopkg, timestamp=None):
    if not timestamp:
        timestamp = datetime.now()
    dbpkg.pkgver = repopkg.ver
    dbpkg.pkgrel = repopkg.rel
    dbpkg.pkgdesc = repopkg.desc
    dbpkg.license = repopkg.license
    dbpkg.url = repopkg.url
    dbpkg.needupdate = False
    dbpkg.last_update = timestamp
    dbpkg.save()
    # files are not in the repo.db.tar.gz
    #for x in repopkg.files:
    #    dbpkg.packagefile_set.create(path=x)
    dbpkg.packagedepend_set.all().delete()
    if 'depends' in repopkg.__dict__:
        for y in repopkg.depends:
            # make sure we aren't adding self depends..
            # yes *sigh* i have seen them in pkgbuilds
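            # split e.g. 'glibc>=2.8' into ('glibc', '>=2.8'); a bare
            # depend like 'glibc' leaves the comparison part empty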
            dpname,dpvcmp = re.match(r"([a-z0-9._+-]+)(.*)", y).groups()
            if dpname == repopkg.name:
                logger.warning('Package %s has a depend on itself' % repopkg.name)
                continue
            dbpkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
            logger.debug('Added %s as dep for pkg %s' % (dpname,repopkg.name))


def db_update(archname, pkgs):
    """
    Parses a list and updates the Arch dev database accordingly.

    Arguments:
      pkgs -- A list of Pkg objects.
    
    """
    logger.info('Updating Arch: %s' % archname)
    repository = Repo.objects.get(name__iexact=pkgs[0].repo)
    architecture = Arch.objects.get(name__iexact=archname)
    dbpkgs = Package.objects.filter(arch=architecture, repo=repository)
    # It makes sense to fully evaluate our DB query now because we will
    # be using 99% of the objects in our "in both sets" loop. Force eval
    # by calling list() on the QuerySet.
    list(dbpkgs)
    # This makes our inner loop where we find packages by name *way* more
    # efficient by not having to go to the database for each package to
    # SELECT them by name.
    dbdict = dict([(pkg.pkgname, pkg) for pkg in dbpkgs])
    now = datetime.now()

    # go go set theory!
    # thank you python for having a set class <3
    logger.debug("Creating sets")
    dbset = set([pkg.pkgname for pkg in dbpkgs])
    syncset = set([pkg.name for pkg in pkgs])
    logger.info("%d packages in current web DB" % len(dbset))
    logger.info("%d packages in new updating db" % len(syncset))
    # packages in syncdb and not in database (add to database)
    logger.debug("Set theory: Packages in syncdb not in database")
    in_sync_not_db = syncset - dbset
    logger.info("%d packages in sync not db" % len(in_sync_not_db))

    # Try to catch those random orphaning issues that make Eric so unhappy.
    if len(syncset) < len(dbset) * .5:
        logger.error(".db.tar.gz has less than 50% the number of packages in the web database")
        if repository.name != 'Testing':
            raise SomethingFishyException(
                'It looks like the syncdb is half the size of the web db. WTF?')

    if len(syncset) < len(dbset) * .75:
        logger.warning(".db.tar.gz has 75% the number of packages in the web database.")
    
    for p in [x for x in pkgs if x.name in in_sync_not_db]:
        logger.debug("Adding package %s", p.name)
        ## note: maintainer is being set to orphan for now
        ## maybe later we can add logic to match pkgbuild maintainers 
        ## to db maintainer ids
        pkg = Package(
            pkgname = p.name, arch = architecture, repo = repository,
            maintainer_id = 0)
        populate_pkg(pkg, p, timestamp=now)

    # packages in database and not in syncdb (remove from database)
    logger.debug("Set theory: Packages in database not in syncdb")
    in_db_not_sync = dbset - syncset
    for p in in_db_not_sync:
        logger.info("Removing package %s from database", p)
        Package.objects.get(
            pkgname=p, arch=architecture, repo=repository).delete()

    # packages in both database and in syncdb (update in database)
    logger.debug("Set theory: Packages in database and syncdb")
    pkg_in_both = syncset & dbset
    for p in [x for x in pkgs if x.name in pkg_in_both]:
        logger.debug("Looking for package updates")
        dbp = dbdict[p.name]
        # compare ver and rel as a pair; joining them without a separator
        # would let e.g. ('1.0', '11') collide with ('1.01', '1')
        if (p.ver, p.rel) == (dbp.pkgver, dbp.pkgrel):
            continue
        logger.info("Updating package %s in database", p.name)
        pkg = Package.objects.get(
            pkgname=p.name,arch=architecture, repo=repository)
        populate_pkg(pkg, p, timestamp=now)

    logger.info('Finished updating Arch: %s' % archname)


def parse_inf(iofile):
    """
    Parses an Arch repo db information file, and returns variables as a list.

    Arguments:
     iofile -- A StringIO, FileType, or other object with readlines method.

    """
    store = {}
    lines = iofile.readlines()
    blockname = None
    numlines = len(lines)
    i = 0
    while i < numlines:
        line = lines[i].strip()
        if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
            blockname = line[1:-1].lower()
            logger.debug("Parsing package block %s",blockname)
            store[blockname] = []
            i += 1
            while i < numlines and len(lines[i].strip()) > 0:
                store[blockname].append(lines[i].strip())
                i += 1
            # here is where i would convert arrays to strings
            # based on count and type, but i don't think it is needed now
        i += 1

    return store


def parse_repo(repopath):
    """
    Parses an Arch repo db file, and returns a list of Pkg objects.

    Arguments:
     repopath -- The path of a repository db file.

    """
    logger.info("Starting repo parsing")
    if not os.path.exists(repopath):
        logger.error("Could not read file %s", repopath)
        raise IOError("file not found: %s" % repopath)
    
    logger.info("Reading repo tarfile %s", repopath)
    filename = os.path.split(repopath)[1]
    rindex = filename.rindex('.db.tar.gz')
    reponame = filename[:rindex]
    
    repodb = tarfile.open(repopath,"r:gz")
    ## assuming well formed tar, with dir first then files after
    ## repo-add enforces this
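    ## each package is a directory holding its 'desc' and 'depends' files;
    ## we buffer those files into tpkg and parse the buffer whenever the
    ## next directory entry (or the end of the archive) is reached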
    logger.debug("Starting package parsing")
    pkgs = []
    tpkg = None
    while True:
        tarinfo = repodb.next()
        if tarinfo is None or tarinfo.isdir():
            if tpkg is not None:
                tpkg.reset()
                data = parse_inf(tpkg)
                p = Pkg(data)
                p.repo = reponame
                logger.debug("Done parsing package %s", p.name)
                pkgs.append(p)
            if tarinfo is None:
                break
            # set new tpkg
            tpkg = StringIO()
        if tarinfo.isreg():
            if os.path.split(tarinfo.name)[1] in ('desc','depends'):
                tpkg.write(repodb.extractfile(tarinfo).read())
                tpkg.write('\n') # just in case 
    repodb.close()
    logger.info("Finished repo parsing")
    return pkgs


def main(argv=None):
    """
    Parses repo.db.tar.gz file and returns exit status.

    Keyword Arguments:
     argv -- A list/array simulating a sys.argv (default None)
             If left empty, sys.argv is used

    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        usage()
        return 1
    # check if arch is valid
    available_arches = [x.name for x in Arch.objects.all()]
    if argv[1] not in available_arches:
        usage()
        return 1
    else:
        primary_arch = argv[1]

    repo_file = os.path.normpath(argv[2])
    packages = parse_repo(repo_file)
    
    # sort packages by arch -- to handle noarch stuff
    packages_arches = {}
    for arch in available_arches:
        packages_arches[arch] = []
    
    for package in packages:
        if package.arch in ('any', primary_arch):
            packages_arches[package.arch].append(package)
        else:
            logger.warning("Package %s arch = %s" % (
                package.name,package.arch))
            #package.arch = primary_arch


    logger.info('Starting database updates.')
    for (arch, pkgs) in packages_arches.iteritems():
        if len(pkgs) > 0:
            db_update(arch,pkgs)
    logger.info('Finished database updates.')
    return 0


###
### Main eval 
###

if __name__ == '__main__':
    logger.setLevel(INFO)
    sys.exit(main())