From 76516cae45f3d1080065608bdb8f2d086322012f Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 13 May 2012 19:45:57 -0500 Subject: Don't limit protocols returned by mirror status function If results weren't available for certain URLs, they won't show up anyway in this list, and if we start to check rsync URLs, then we want their values to come back in this status list. Signed-off-by: Dan McGee --- mirrors/utils.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 32fa3587..54de567e 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -32,11 +32,9 @@ def annotate_url(url, delays): @cache_function(123) def get_mirror_statuses(cutoff=default_cutoff): cutoff_time = utc_now() - cutoff - protocols = list(MirrorProtocol.objects.filter(is_download=True)) # I swear, this actually has decent performance... urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( mirror__active=True, mirror__public=True, - protocol__in=protocols, logs__check_time__gte=cutoff_time).annotate( check_count=Count('logs'), success_count=Count('logs__duration'), -- cgit v1.2.3-2-g168b From ae1c526ffbe908322f0dd8d8805360b81ab22b0f Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 13 May 2012 20:35:50 -0500 Subject: Add ability to restrict status report to single tier This should make it easier to catch errors in our Tier 1 mirrors. Signed-off-by: Dan McGee --- mirrors/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 54de567e..2014411d 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -88,7 +88,7 @@ def get_mirror_errors(cutoff=default_cutoff): is_success=False, check_time__gte=cutoff_time, url__mirror__active=True, url__mirror__public=True).values( 'url__url', 'url__country', 'url__protocol__protocol', - 'url__mirror__country', 'error').annotate( + 'url__mirror__country', 'url__mirror__tier', 'error').annotate( error_count=Count('error'), last_occurred=Max('check_time') ).order_by('-last_occurred', '-error_count') errors = list(errors) -- cgit v1.2.3-2-g168b From 0f3c894e7a0f573fa0198459150f387c3a7f23ae Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 8 Jul 2012 20:38:01 -0500 Subject: Don't include StdDev on sqlite3 mirror status query Because this function isn't shipped by default, it makes more sense to just omit it completely from the query we do to build the tables on this page when in development. Substitute 0.0 for the value so the rest of the calculations and display work as expected. Signed-off-by: Dan McGee --- mirrors/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 2014411d..9aa8e0f5 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -3,7 +3,7 @@ from datetime import timedelta from django.db.models import Avg, Count, Max, Min, StdDev from django_countries.fields import Country -from main.utils import cache_function, utc_now +from main.utils import cache_function, utc_now, database_vendor from .models import MirrorLog, MirrorProtocol, MirrorUrl @@ -40,8 +40,11 @@ def get_mirror_statuses(cutoff=default_cutoff): success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration'), - duration_stddev=StdDev('logs__duration')) + duration_avg=Avg('logs__duration')) + + vendor = database_vendor(MirrorUrl) + if vendor != 'sqlite': + urls.annotate(duration_stddev=StdDev('logs__duration')) # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the @@ -70,6 +73,9 @@ def get_mirror_statuses(cutoff=default_cutoff): check_frequency = None for url in urls: + # fake the standard deviation for local testing setups + if vendor == 'sqlite': + setattr(url, 'duration_stddev', 0.0) annotate_url(url, delays) return { -- cgit v1.2.3-2-g168b From 9c7350650e66b5eb6228778e335a160be5ea7f98 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 8 Jul 2012 21:53:35 -0500 Subject: Correctly reassign queryset with added annotation in mirror status This was a dumb oversight on my part in commit 0f3c894e7a0. Signed-off-by: Dan McGee --- mirrors/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 9aa8e0f5..f2c98ee0 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -44,7 +44,7 @@ def get_mirror_statuses(cutoff=default_cutoff): vendor = database_vendor(MirrorUrl) if vendor != 'sqlite': - urls.annotate(duration_stddev=StdDev('logs__duration')) + urls = urls.annotate(duration_stddev=StdDev('logs__duration')) # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the -- cgit v1.2.3-2-g168b From c0bf9e20660cfae7ea8994472555bba23398b598 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 24 Jul 2012 09:19:48 -0500 Subject: Remove custom utc_now() function, use django.utils.timezone.now() This was around from the time when we handled timezones sanely and Django did not; now that we are on 1.4 we no longer need our own code to handle this. Signed-off-by: Dan McGee --- mirrors/utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index f2c98ee0..bf030d39 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -1,13 +1,14 @@ from datetime import timedelta from django.db.models import Avg, Count, Max, Min, StdDev +from django.utils.timezone import now from django_countries.fields import Country -from main.utils import cache_function, utc_now, database_vendor +from main.utils import cache_function, database_vendor from .models import MirrorLog, MirrorProtocol, MirrorUrl -default_cutoff = timedelta(hours=24) +DEFAULT_CUTOFF = timedelta(hours=24) def annotate_url(url, delays): '''Given a MirrorURL object, add a few more attributes to it regarding @@ -30,8 +31,8 @@ def annotate_url(url, delays): @cache_function(123) -def get_mirror_statuses(cutoff=default_cutoff): - cutoff_time = utc_now() - cutoff +def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): + cutoff_time = now() - cutoff # I swear, this actually has decent performance... urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( mirror__active=True, mirror__public=True, @@ -88,8 +89,8 @@ def get_mirror_statuses(cutoff=default_cutoff): @cache_function(117) -def get_mirror_errors(cutoff=default_cutoff): - cutoff_time = utc_now() - cutoff +def get_mirror_errors(cutoff=DEFAULT_CUTOFF): + cutoff_time = now() - cutoff errors = MirrorLog.objects.filter( is_success=False, check_time__gte=cutoff_time, url__mirror__active=True, url__mirror__public=True).values( @@ -105,11 +106,11 @@ def get_mirror_errors(cutoff=default_cutoff): @cache_function(295) -def get_mirror_url_for_download(cutoff=default_cutoff): +def get_mirror_url_for_download(cutoff=DEFAULT_CUTOFF): '''Find a good mirror URL to use for package downloads. If we have mirror status data available, it is used to determine a good choice by looking at the last batch of status rows.''' - cutoff_time = utc_now() - cutoff + cutoff_time = now() - cutoff status_data = MirrorLog.objects.filter( check_time__gte=cutoff_time).aggregate( Max('check_time'), Max('last_sync')) -- cgit v1.2.3-2-g168b From f0b7e73de61c03a5018ed352605e6329611448d2 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 10 Oct 2012 20:17:55 -0500 Subject: Make mirror log time query a bit more efficient We don't need the full mirror log objects; we just need a very small subset of values from them here to do the required math and object building. Signed-off-by: Dan McGee --- mirrors/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index bf030d39..0a32b766 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -50,12 +50,14 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the # results here. - times = MirrorLog.objects.filter(is_success=True, last_sync__isnull=False, + times = MirrorLog.objects.values_list( + 'url_id', 'check_time', 'last_sync').filter( + is_success=True, last_sync__isnull=False, check_time__gte=cutoff_time) delays = {} - for log in times: - delay = log.check_time - log.last_sync - delays.setdefault(log.url_id, []).append(delay) + for url_id, check_time, last_sync in times: + delay = check_time - last_sync + delays.setdefault(url_id, []).append(delay) if urls: last_check = max([u.last_check for u in urls]) -- cgit v1.2.3-2-g168b From 86102c6e645451c03e3e576060eba7f93350bf6b Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sat, 10 Nov 2012 14:19:23 -0600 Subject: Allow filtering retrieved mirror statuses by mirror_id When we don't need them all, no need to fetch them all. Let the database do the work for us, hopefully. Signed-off-by: Dan McGee --- mirrors/utils.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 0a32b766..ba027c99 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -31,7 +31,7 @@ def annotate_url(url, delays): @cache_function(123) -def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): +def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff # I swear, this actually has decent performance... urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( @@ -43,6 +43,9 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): last_check=Max('logs__check_time'), duration_avg=Avg('logs__duration')) + if mirror_ids: + urls = urls.filter(mirror_id__in=mirror_ids) + vendor = database_vendor(MirrorUrl) if vendor != 'sqlite': urls = urls.annotate(duration_stddev=StdDev('logs__duration')) @@ -54,6 +57,8 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): 'url_id', 'check_time', 'last_sync').filter( is_success=True, last_sync__isnull=False, check_time__gte=cutoff_time) + if mirror_ids: + times = times.filter(url__mirror_id__in=mirror_ids) delays = {} for url_id, check_time, last_sync in times: delay = check_time - last_sync @@ -62,8 +67,10 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): if urls: last_check = max([u.last_check for u in urls]) num_checks = max([u.check_count for u in urls]) - check_info = MirrorLog.objects.filter( - check_time__gte=cutoff_time).aggregate( + check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time) + if mirror_ids: + check_info = check_info.filter(url__mirror_id__in=mirror_ids) + check_info = check_info.aggregate( mn=Min('check_time'), mx=Max('check_time')) if num_checks > 1: check_frequency = (check_info['mx'] - check_info['mn']) \ @@ -91,7 +98,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): @cache_function(117) -def get_mirror_errors(cutoff=DEFAULT_CUTOFF): +def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff errors = MirrorLog.objects.filter( is_success=False, check_time__gte=cutoff_time, @@ -100,6 +107,10 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF): 'url__mirror__country', 'url__mirror__tier', 'error').annotate( error_count=Count('error'), last_occurred=Max('check_time') ).order_by('-last_occurred', '-error_count') + + if mirror_ids: + urls = urls.filter(mirror_id__in=mirror_ids) + errors = list(errors) for err in errors: ctry_code = err['url__country'] or err['url__mirror__country'] -- cgit v1.2.3-2-g168b From e26d5722289bd2e972633891d8dac09296b0cbc4 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 11 Nov 2012 14:55:37 -0600 Subject: Mirror graph tweaking after usage with real data * Clamp y-axis minimum to 0. * Don't plot `is_success == false` values. * Ensure URLs are sorted predictably. Signed-off-by: Dan McGee --- mirrors/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index ba027c99..85e4ee93 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -41,7 +41,8 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration')) + duration_avg=Avg('logs__duration')).order_by( + 'mirror', 'url') if mirror_ids: urls = urls.filter(mirror_id__in=mirror_ids) -- cgit v1.2.3-2-g168b From a2cfa7edbbed8edb1ad4d3391c6edb055c13de1b Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Fri, 16 Nov 2012 15:39:56 -0600 Subject: Optimize mirror status data fetching Now that we have as many mirror URLs as we do, we can do a better job fetching and aggregating this data. The prior method resulted in a rather unwieldy query being pushed down to the database with a horrendously long GROUP BY clause. Instead of trying to group by everything at once so we can retrieve mirror URL info at the same time, separate the two queries- one for getting URL performance data, one for the qualitative data. The impetus behind fixing this is the PostgreSQL slow query log in production; this currently shows up the most of any queries we run in the system. Signed-off-by: Dan McGee --- mirrors/utils.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 85e4ee93..07a7138f 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -33,23 +33,26 @@ def annotate_url(url, delays): @cache_function(123) def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff - # I swear, this actually has decent performance... - urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( + url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( mirror__active=True, mirror__public=True, logs__check_time__gte=cutoff_time).annotate( check_count=Count('logs'), success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration')).order_by( - 'mirror', 'url') - - if mirror_ids: - urls = urls.filter(mirror_id__in=mirror_ids) + duration_avg=Avg('logs__duration')) vendor = database_vendor(MirrorUrl) if vendor != 'sqlite': - urls = urls.annotate(duration_stddev=StdDev('logs__duration')) + url_data = url_data.annotate(duration_stddev=StdDev('logs__duration')) + + urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( + mirror__active=True, mirror__public=True, + logs__check_time__gte=cutoff_time).order_by('mirror__id', 'url') + + if mirror_ids: + url_data = url_data.filter(mirror_id__in=mirror_ids) + urls = urls.filter(mirror_id__in=mirror_ids) # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the @@ -66,6 +69,11 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): delays.setdefault(url_id, []).append(delay) if urls: + url_data = dict((item['id'], item) for item in url_data) + for url in urls: + for k, v in url_data.get(url.id, {}).items(): + if k not in ('id', 'mirror_id'): + setattr(url, k, v) last_check = max([u.last_check for u in urls]) num_checks = max([u.check_count for u in urls]) check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time) -- cgit v1.2.3-2-g168b From f0f6f7235a62186c1cae9c79036dde5d8821373d Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Mon, 19 Nov 2012 08:10:21 -0600 Subject: Fix mirror URL duplication in status view We need to ensure we don't duplicate URLs in the status view, so add a distinct() call back in to the queryset when it was inadvertently dropped in commit a2cfa7edbb. This negates a lot of the performance gains we had, unfortunately, so it looks like a nested subquery might be more efficient. Disappointing the planner can't do this for us. Signed-off-by: Dan McGee --- mirrors/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 07a7138f..a62c7f05 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -48,7 +48,8 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( mirror__active=True, mirror__public=True, - logs__check_time__gte=cutoff_time).order_by('mirror__id', 'url') + logs__check_time__gte=cutoff_time).distinct().order_by( + 'mirror__id', 'url') if mirror_ids: url_data = url_data.filter(mirror_id__in=mirror_ids) -- cgit v1.2.3-2-g168b From bec73c7a37c07821f145dbcf11435d4f2b94a149 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sat, 12 Jan 2013 15:56:36 -0600 Subject: Round two of mirror status query improvements This seems to generate much more performant queries at the database level than what we were previously doing, and also doesn't show duplicate rows. Signed-off-by: Dan McGee --- mirrors/utils.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index a62c7f05..1d560021 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -33,9 +33,16 @@ def annotate_url(url, delays): @cache_function(123) def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff - url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( + + valid_urls = MirrorUrl.objects.filter( mirror__active=True, mirror__public=True, - logs__check_time__gte=cutoff_time).annotate( + logs__check_time__gte=cutoff_time).distinct() + + if mirror_ids: + valid_urls = valid_urls.filter(mirror_id__in=mirror_ids) + + url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( + id__in=valid_urls, logs__check_time__gte=cutoff_time).annotate( check_count=Count('logs'), success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), @@ -47,13 +54,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): url_data = url_data.annotate(duration_stddev=StdDev('logs__duration')) urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( - mirror__active=True, mirror__public=True, - logs__check_time__gte=cutoff_time).distinct().order_by( - 'mirror__id', 'url') - - if mirror_ids: - url_data = url_data.filter(mirror_id__in=mirror_ids) - urls = urls.filter(mirror_id__in=mirror_ids) + id__in=valid_urls).order_by('mirror__id', 'url') # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the -- cgit v1.2.3-2-g168b From 6f0ae6746baea657ee6d7c21ac0813a04f825443 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Mon, 14 Jan 2013 01:00:11 -0600 Subject: Drop country column from mirror table We now always look for this information at the URL level, not the mirror level. This simplifies quite a bit of code in and around the mirror views. Signed-off-by: Dan McGee --- mirrors/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 1d560021..3ab176b3 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -115,7 +115,7 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): is_success=False, check_time__gte=cutoff_time, url__mirror__active=True, url__mirror__public=True).values( 'url__url', 'url__country', 'url__protocol__protocol', - 'url__mirror__country', 'url__mirror__tier', 'error').annotate( + 'url__mirror__tier', 'error').annotate( error_count=Count('error'), last_occurred=Max('check_time') ).order_by('-last_occurred', '-error_count') @@ -124,8 +124,7 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): errors = list(errors) for err in errors: - ctry_code = err['url__country'] or err['url__mirror__country'] - err['country'] = Country(ctry_code) + err['country'] = Country(err['url__country']) return errors @@ -152,7 +151,7 @@ def get_mirror_url_for_download(cutoff=DEFAULT_CUTOFF): mirror_urls = MirrorUrl.objects.filter( mirror__public=True, mirror__active=True, protocol__default=True) # look first for a country-agnostic URL, then fall back to any HTTP URL - filtered_urls = mirror_urls.filter(mirror__country='')[:1] + filtered_urls = mirror_urls.filter(country='')[:1] if not filtered_urls: filtered_urls = mirror_urls[:1] if not filtered_urls: -- cgit v1.2.3-2-g168b From 2c24ee9100a9e60fec16055d6496caeda3a1d8e2 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sat, 13 Apr 2013 11:33:17 -0500 Subject: Calculate average URL delay in the database Rather than doing this in the Python code and needing 12,000+ rows returned from the database, we can do it in the database and get fewer than 300 rows back. If I recall correctly, the reason this was not done originally was due to our usage of MySQL and some really bad date math/overflow stuff it did when the interval between last_sync and check_time were greater than about a week. Luckily, we have switched to using a more sane database. Signed-off-by: Dan McGee --- mirrors/utils.py | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 3ab176b3..2721e20e 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -1,5 +1,6 @@ from datetime import timedelta +from django.db import connection from django.db.models import Avg, Count, Max, Min, StdDev from django.utils.timezone import now from django_countries.fields import Country @@ -10,13 +11,12 @@ from .models import MirrorLog, MirrorProtocol, MirrorUrl DEFAULT_CUTOFF = timedelta(hours=24) -def annotate_url(url, delays): +def annotate_url(url, delay): '''Given a MirrorURL object, add a few more attributes to it regarding status, including completion_pct, delay, and score.''' url.completion_pct = float(url.success_count) / url.check_count - if url.id in delays: - url_delays = delays[url.id] - url.delay = sum(url_delays, timedelta()) / len(url_delays) + if delay is not None: + url.delay = delay hours = url.delay.days * 24.0 + url.delay.seconds / 3600.0 if url.completion_pct > 0: @@ -30,6 +30,30 @@ def annotate_url(url, delays): url.score = None +def url_delays(cutoff_time, mirror_id=None): + cursor = connection.cursor() + if mirror_id is None: + sql= """ +SELECT url_id, AVG(check_time - last_sync) +FROM mirrors_mirrorlog +WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL +GROUP BY url_id +""" + cursor.execute(sql, [True, cutoff_time]) + else: + sql = """ +SELECT l.url_id, avg(check_time - last_sync) +FROM mirrors_mirrorlog l +JOIN mirrors_mirrorurl u ON u.id = l.url_id +WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL +AND mirror_id = %s +GROUP BY url_id +""" + cursor.execute(sql, [True, cutoff_time, mirror_id]) + + return {url_id: delay for url_id, delay in cursor.fetchall()} + + @cache_function(123) def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff @@ -55,20 +79,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( id__in=valid_urls).order_by('mirror__id', 'url') - - # The Django ORM makes it really hard to get actual average delay in the - # above query, so run a seperate query for it and we will process the - # results here. - times = MirrorLog.objects.values_list( - 'url_id', 'check_time', 'last_sync').filter( - is_success=True, last_sync__isnull=False, - check_time__gte=cutoff_time) - if mirror_ids: - times = times.filter(url__mirror_id__in=mirror_ids) - delays = {} - for url_id, check_time, last_sync in times: - delay = check_time - last_sync - delays.setdefault(url_id, []).append(delay) + delays = url_delays(cutoff_time) if urls: url_data = dict((item['id'], item) for item in url_data) @@ -97,7 +108,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): # fake the standard deviation for local testing setups if vendor == 'sqlite': setattr(url, 'duration_stddev', 0.0) - annotate_url(url, delays) + annotate_url(url, delays.get(url.id, None)) return { 'cutoff': cutoff, -- cgit v1.2.3-2-g168b From c588d1c85f86f5ee10a96bec679111c8675b703c Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sat, 13 Apr 2013 11:38:11 -0500 Subject: Support only a single mirror ID in error/status retrieval This simplifies things and makes injecting this single mirror ID into custom SQL a whole lot easier. Signed-off-by: Dan McGee --- mirrors/utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 2721e20e..d18dc22f 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -55,15 +55,15 @@ GROUP BY url_id @cache_function(123) -def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): +def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None): cutoff_time = now() - cutoff valid_urls = MirrorUrl.objects.filter( mirror__active=True, mirror__public=True, logs__check_time__gte=cutoff_time).distinct() - if mirror_ids: - valid_urls = valid_urls.filter(mirror_id__in=mirror_ids) + if mirror_id: + valid_urls = valid_urls.filter(mirror_id=mirror_id) url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( id__in=valid_urls, logs__check_time__gte=cutoff_time).annotate( @@ -79,7 +79,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( id__in=valid_urls).order_by('mirror__id', 'url') - delays = url_delays(cutoff_time) + delays = url_delays(cutoff_time, mirror_id) if urls: url_data = dict((item['id'], item) for item in url_data) @@ -90,8 +90,8 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): last_check = max([u.last_check for u in urls]) num_checks = max([u.check_count for u in urls]) check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time) - if mirror_ids: - check_info = check_info.filter(url__mirror_id__in=mirror_ids) + if mirror_id: + check_info = check_info.filter(url__mirror_id=mirror_id) check_info = check_info.aggregate( mn=Min('check_time'), mx=Max('check_time')) if num_checks > 1: @@ -120,7 +120,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): @cache_function(117) -def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): +def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_id=None): cutoff_time = now() - cutoff errors = MirrorLog.objects.filter( is_success=False, check_time__gte=cutoff_time, @@ -130,8 +130,8 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): error_count=Count('error'), last_occurred=Max('check_time') ).order_by('-last_occurred', '-error_count') - if mirror_ids: - urls = urls.filter(mirror_id__in=mirror_ids) + if mirror_id: + urls = urls.filter(mirror_id=mirror_id) errors = list(errors) for err in errors: -- cgit v1.2.3-2-g168b From 213aa3a2fab6f3a56be348a067c132f568efbaff Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 14 Apr 2013 13:09:41 -0500 Subject: Reduce mirror status query madness Move completely to custom SQL for this logic. The Django ORM just doesn't play nice with the kind of query we are looking to do, so it is easier to do using raw SQL. The biggest pain factor here is in supporting sqlite as it doesn't have nearly the capabilities in handling datetime types directly in the database, as well as having some different type conversion necessities. Signed-off-by: Dan McGee --- mirrors/utils.py | 146 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 55 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index d18dc22f..eb1211f1 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -2,6 +2,7 @@ from datetime import timedelta from django.db import connection from django.db.models import Avg, Count, Max, Min, StdDev +from django.utils.dateparse import parse_datetime from django.utils.timezone import now from django_countries.fields import Country @@ -11,47 +12,103 @@ from .models import MirrorLog, MirrorProtocol, MirrorUrl DEFAULT_CUTOFF = timedelta(hours=24) -def annotate_url(url, delay): - '''Given a MirrorURL object, add a few more attributes to it regarding - status, including completion_pct, delay, and score.''' - url.completion_pct = float(url.success_count) / url.check_count - if delay is not None: - url.delay = delay - hours = url.delay.days * 24.0 + url.delay.seconds / 3600.0 - if url.completion_pct > 0: - divisor = url.completion_pct - else: - # arbitrary small value - divisor = 0.005 - url.score = (hours + url.duration_avg + url.duration_stddev) / divisor - else: - url.delay = None - url.score = None +def dictfetchall(cursor): + "Returns all rows from a cursor as a dict." + desc = cursor.description + return [ + dict(zip([col[0] for col in desc], row)) + for row in cursor.fetchall() + ] -def url_delays(cutoff_time, mirror_id=None): - cursor = connection.cursor() - if mirror_id is None: - sql= """ -SELECT url_id, AVG(check_time - last_sync) -FROM mirrors_mirrorlog -WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL -GROUP BY url_id +def status_data(cutoff_time, mirror_id=None): + if mirror_id is not None: + params = [cutoff_time, mirror_id] + mirror_where = 'AND u.mirror_id = %s' + else: + params = [cutoff_time] + mirror_where = '' + + vendor = database_vendor(MirrorUrl) + if vendor == 'sqlite': + sql = """ +SELECT l.url_id, u.mirror_id, + COUNT(l.id) AS check_count, + COUNT(l.duration) AS success_count, + MAX(l.last_sync) AS last_sync, + MAX(l.check_time) AS last_check, + AVG(l.duration) AS duration_avg, + 0.0 AS duration_stddev, + AVG(STRFTIME('%%s', check_time) - STRFTIME('%%s', last_sync)) AS delay +FROM mirrors_mirrorlog l +JOIN mirrors_mirrorurl u ON u.id = l.url_id +WHERE l.check_time >= %s +""" + mirror_where + """ +GROUP BY l.url_id, u.mirror_id """ - cursor.execute(sql, [True, cutoff_time]) else: sql = """ -SELECT l.url_id, avg(check_time - last_sync) +SELECT l.url_id, u.mirror_id, + COUNT(l.id) AS check_count, + COUNT(l.duration) AS success_count, + MAX(l.last_sync) AS last_sync, + MAX(l.check_time) AS last_check, + AVG(l.duration) AS duration_avg, + STDDEV(l.duration) AS duration_stddev, + AVG(check_time - last_sync) AS delay FROM mirrors_mirrorlog l JOIN mirrors_mirrorurl u ON u.id = l.url_id -WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL -AND mirror_id = %s -GROUP BY url_id +WHERE l.check_time >= %s +""" + mirror_where + """ +GROUP BY l.url_id, u.mirror_id """ - cursor.execute(sql, [True, cutoff_time, mirror_id]) - return {url_id: delay for url_id, delay in cursor.fetchall()} + cursor = connection.cursor() + cursor.execute(sql, params) + url_data = dictfetchall(cursor) + + # sqlite loves to return less than ideal types + if vendor == 'sqlite': + for item in url_data: + item['delay'] = timedelta(seconds=item['delay']) + item['last_sync'] = parse_datetime(item['last_sync']) + item['last_check'] = parse_datetime(item['last_check']) + + return {item['url_id']: item for item in url_data} + + +def annotate_url(url, url_data): + '''Given a MirrorURL object, add a few more attributes to it regarding + status, including completion_pct, delay, and score.''' + known_attrs = ( + ('success_count', 0), + ('check_count', 0), + ('completion_pct', None), + ('last_check', None), + ('last_sync', None), + ('delay', None), + ('score', None), + ) + for k, v in known_attrs: + setattr(url, k, v) + for k, v in url_data.items(): + if k not in ('url_id', 'mirror_id'): + setattr(url, k, v) + + if url.check_count > 0: + url.completion_pct = float(url.success_count) / url.check_count + + if url.delay is not None: + hours = url.delay.days * 24.0 + url.delay.seconds / 3600.0 + + if url.completion_pct > 0: + divisor = url.completion_pct + else: + # arbitrary small value + divisor = 0.005 + stddev = url.duration_stddev or 0.0 + url.score = (hours + url.duration_avg + stddev) / divisor @cache_function(123) @@ -65,29 +122,14 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None): if mirror_id: valid_urls = valid_urls.filter(mirror_id=mirror_id) - url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( - id__in=valid_urls, logs__check_time__gte=cutoff_time).annotate( - check_count=Count('logs'), - success_count=Count('logs__duration'), - last_sync=Max('logs__last_sync'), - last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration')) - - vendor = database_vendor(MirrorUrl) - if vendor != 'sqlite': - url_data = url_data.annotate(duration_stddev=StdDev('logs__duration')) - + url_data = status_data(cutoff_time, mirror_id) urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( id__in=valid_urls).order_by('mirror__id', 'url') - delays = url_delays(cutoff_time, mirror_id) if urls: - url_data = dict((item['id'], item) for item in url_data) for url in urls: - for k, v in url_data.get(url.id, {}).items(): - if k not in ('id', 'mirror_id'): - setattr(url, k, v) - last_check = max([u.last_check for u in urls]) + annotate_url(url, url_data.get(url.id, {})) + last_check = max([u.last_check for u in urls if u.last_check]) num_checks = max([u.check_count for u in urls]) check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time) if mirror_id: @@ -104,12 +146,6 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None): num_checks = 0 check_frequency = None - for url in urls: - # fake the standard deviation for local testing setups - if vendor == 'sqlite': - setattr(url, 'duration_stddev', 0.0) - annotate_url(url, delays.get(url.id, None)) - return { 'cutoff': cutoff, 'last_check': last_check, -- cgit v1.2.3-2-g168b From f357a39a49a8edc713d512976a0be2a2a8ac5c4f Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 14 Apr 2013 13:21:16 -0500 Subject: Remove cache_function decorator from a few spots The benefit of these storage operations might be outweighed by the cost, especially given how infrequently these functions are called. Signed-off-by: Dan McGee --- mirrors/utils.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index eb1211f1..5a8bbf5d 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -111,7 +111,6 @@ def annotate_url(url, url_data): url.score = (hours + url.duration_avg + stddev) / divisor -@cache_function(123) def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None): cutoff_time = now() - cutoff @@ -155,7 +154,6 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None): } -@cache_function(117) def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_id=None): cutoff_time = now() - cutoff errors = MirrorLog.objects.filter( -- cgit v1.2.3-2-g168b From b7b24740640e24883cd17fd683e1d465fbb343f8 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 16 Apr 2013 22:12:01 -0500 Subject: Various minor code cleanups and fixes Most of these were suggested by PyCharm, and include everything from little syntax issues and other bad smells to dead or bad code. Signed-off-by: Dan McGee --- mirrors/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 5a8bbf5d..531cf005 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -1,13 +1,13 @@ from datetime import timedelta from django.db import connection -from django.db.models import Avg, Count, Max, Min, StdDev +from django.db.models import Count, Max, Min from django.utils.dateparse import parse_datetime from django.utils.timezone import now from django_countries.fields import Country from main.utils import cache_function, database_vendor -from .models import MirrorLog, MirrorProtocol, MirrorUrl +from .models import MirrorLog, MirrorUrl DEFAULT_CUTOFF = timedelta(hours=24) @@ -165,7 +165,7 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_id=None): ).order_by('-last_occurred', '-error_count') if mirror_id: - urls = urls.filter(mirror_id=mirror_id) + errors = errors.filter(url__mirror_id=mirror_id) errors = list(errors) for err in errors: -- cgit v1.2.3-2-g168b From 6de0cfbd23aae69036439db817cc26740d8796cd Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sat, 20 Apr 2013 11:13:49 -0500 Subject: Fix some None issues with sqlite3 and mirror status If certain attributes came back from the database as NULL, we had issues parsing them. Pass None/NULL straight through rather than trying to type-convert. Signed-off-by: Dan McGee --- mirrors/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 531cf005..ba45da5f 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -71,8 +71,10 @@ GROUP BY l.url_id, u.mirror_id # sqlite loves to return less than ideal types if vendor == 'sqlite': for item in url_data: - item['delay'] = timedelta(seconds=item['delay']) - item['last_sync'] = parse_datetime(item['last_sync']) + if item['delay'] is not None: + item['delay'] = timedelta(seconds=item['delay']) + if item['last_sync'] is not None: + item['last_sync'] = parse_datetime(item['last_sync']) item['last_check'] = parse_datetime(item['last_check']) return {item['url_id']: item for item in url_data} -- cgit v1.2.3-2-g168b