From 76516cae45f3d1080065608bdb8f2d086322012f Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 13 May 2012 19:45:57 -0500 Subject: Don't limit protocols returned by mirror status function If results weren't available for certain URLs, they won't show up anyway in this list, and if we start to check rsync URLs, then we want their values to come back in this status list. Signed-off-by: Dan McGee --- mirrors/utils.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 32fa3587..54de567e 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -32,11 +32,9 @@ def annotate_url(url, delays): @cache_function(123) def get_mirror_statuses(cutoff=default_cutoff): cutoff_time = utc_now() - cutoff - protocols = list(MirrorProtocol.objects.filter(is_download=True)) # I swear, this actually has decent performance... urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( mirror__active=True, mirror__public=True, - protocol__in=protocols, logs__check_time__gte=cutoff_time).annotate( check_count=Count('logs'), success_count=Count('logs__duration'), -- cgit v1.2.3-2-g168b From ae1c526ffbe908322f0dd8d8805360b81ab22b0f Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 13 May 2012 20:35:50 -0500 Subject: Add ability to restrict status report to single tier This should make it easier to catch errors in our Tier 1 mirrors. Signed-off-by: Dan McGee --- mirrors/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 54de567e..2014411d 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -88,7 +88,7 @@ def get_mirror_errors(cutoff=default_cutoff): is_success=False, check_time__gte=cutoff_time, url__mirror__active=True, url__mirror__public=True).values( 'url__url', 'url__country', 'url__protocol__protocol', - 'url__mirror__country', 'error').annotate( + 'url__mirror__country', 'url__mirror__tier', 'error').annotate( error_count=Count('error'), last_occurred=Max('check_time') ).order_by('-last_occurred', '-error_count') errors = list(errors) -- cgit v1.2.3-2-g168b From 0f3c894e7a0f573fa0198459150f387c3a7f23ae Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 8 Jul 2012 20:38:01 -0500 Subject: Don't include StdDev on sqlite3 mirror status query Because this function isn't shipped by default, it makes more sense to just omit it completely from the query we do to build the tables on this page when in development. Substitute 0.0 for the value so the rest of the calculations and display work as expected. Signed-off-by: Dan McGee --- mirrors/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 2014411d..9aa8e0f5 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -3,7 +3,7 @@ from datetime import timedelta from django.db.models import Avg, Count, Max, Min, StdDev from django_countries.fields import Country -from main.utils import cache_function, utc_now +from main.utils import cache_function, utc_now, database_vendor from .models import MirrorLog, MirrorProtocol, MirrorUrl @@ -40,8 +40,11 @@ def get_mirror_statuses(cutoff=default_cutoff): success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration'), - duration_stddev=StdDev('logs__duration')) + duration_avg=Avg('logs__duration')) + + vendor = database_vendor(MirrorUrl) + if vendor != 'sqlite': + urls.annotate(duration_stddev=StdDev('logs__duration')) # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the @@ -70,6 +73,9 @@ def get_mirror_statuses(cutoff=default_cutoff): check_frequency = None for url in urls: + # fake the standard deviation for local testing setups + if vendor == 'sqlite': + setattr(url, 'duration_stddev', 0.0) annotate_url(url, delays) return { -- cgit v1.2.3-2-g168b From 9c7350650e66b5eb6228778e335a160be5ea7f98 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 8 Jul 2012 21:53:35 -0500 Subject: Correctly reassign queryset with added annotation in mirror status This was a dumb oversight on my part in commit 0f3c894e7a0. Signed-off-by: Dan McGee --- mirrors/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 9aa8e0f5..f2c98ee0 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -44,7 +44,7 @@ def get_mirror_statuses(cutoff=default_cutoff): vendor = database_vendor(MirrorUrl) if vendor != 'sqlite': - urls.annotate(duration_stddev=StdDev('logs__duration')) + urls = urls.annotate(duration_stddev=StdDev('logs__duration')) # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the -- cgit v1.2.3-2-g168b From c0bf9e20660cfae7ea8994472555bba23398b598 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Tue, 24 Jul 2012 09:19:48 -0500 Subject: Remove custom utc_now() function, use django.utils.timezone.now() This was around from the time when we handled timezones sanely and Django did not; now that we are on 1.4 we no longer need our own code to handle this. Signed-off-by: Dan McGee --- mirrors/utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index f2c98ee0..bf030d39 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -1,13 +1,14 @@ from datetime import timedelta from django.db.models import Avg, Count, Max, Min, StdDev +from django.utils.timezone import now from django_countries.fields import Country -from main.utils import cache_function, utc_now, database_vendor +from main.utils import cache_function, database_vendor from .models import MirrorLog, MirrorProtocol, MirrorUrl -default_cutoff = timedelta(hours=24) +DEFAULT_CUTOFF = timedelta(hours=24) def annotate_url(url, delays): '''Given a MirrorURL object, add a few more attributes to it regarding @@ -30,8 +31,8 @@ def annotate_url(url, delays): @cache_function(123) -def get_mirror_statuses(cutoff=default_cutoff): - cutoff_time = utc_now() - cutoff +def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): + cutoff_time = now() - cutoff # I swear, this actually has decent performance... urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( mirror__active=True, mirror__public=True, @@ -88,8 +89,8 @@ def get_mirror_statuses(cutoff=default_cutoff): @cache_function(117) -def get_mirror_errors(cutoff=default_cutoff): - cutoff_time = utc_now() - cutoff +def get_mirror_errors(cutoff=DEFAULT_CUTOFF): + cutoff_time = now() - cutoff errors = MirrorLog.objects.filter( is_success=False, check_time__gte=cutoff_time, url__mirror__active=True, url__mirror__public=True).values( @@ -105,11 +106,11 @@ def get_mirror_errors(cutoff=default_cutoff): @cache_function(295) -def get_mirror_url_for_download(cutoff=default_cutoff): +def get_mirror_url_for_download(cutoff=DEFAULT_CUTOFF): '''Find a good mirror URL to use for package downloads. If we have mirror status data available, it is used to determine a good choice by looking at the last batch of status rows.''' - cutoff_time = utc_now() - cutoff + cutoff_time = now() - cutoff status_data = MirrorLog.objects.filter( check_time__gte=cutoff_time).aggregate( Max('check_time'), Max('last_sync')) -- cgit v1.2.3-2-g168b From f0b7e73de61c03a5018ed352605e6329611448d2 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 10 Oct 2012 20:17:55 -0500 Subject: Make mirror log time query a bit more efficient We don't need the full mirror log objects; we just need a very small subset of values from them here to do the required math and object building. Signed-off-by: Dan McGee --- mirrors/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index bf030d39..0a32b766 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -50,12 +50,14 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the # results here. - times = MirrorLog.objects.filter(is_success=True, last_sync__isnull=False, + times = MirrorLog.objects.values_list( + 'url_id', 'check_time', 'last_sync').filter( + is_success=True, last_sync__isnull=False, check_time__gte=cutoff_time) delays = {} - for log in times: - delay = log.check_time - log.last_sync - delays.setdefault(log.url_id, []).append(delay) + for url_id, check_time, last_sync in times: + delay = check_time - last_sync + delays.setdefault(url_id, []).append(delay) if urls: last_check = max([u.last_check for u in urls]) -- cgit v1.2.3-2-g168b From 86102c6e645451c03e3e576060eba7f93350bf6b Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sat, 10 Nov 2012 14:19:23 -0600 Subject: Allow filtering retrieved mirror statuses by mirror_id When we don't need them all, no need to fetch them all. Let the database do the work for us, hopefully. Signed-off-by: Dan McGee --- mirrors/utils.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 0a32b766..ba027c99 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -31,7 +31,7 @@ def annotate_url(url, delays): @cache_function(123) -def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): +def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff # I swear, this actually has decent performance... urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( @@ -43,6 +43,9 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): last_check=Max('logs__check_time'), duration_avg=Avg('logs__duration')) + if mirror_ids: + urls = urls.filter(mirror_id__in=mirror_ids) + vendor = database_vendor(MirrorUrl) if vendor != 'sqlite': urls = urls.annotate(duration_stddev=StdDev('logs__duration')) @@ -54,6 +57,8 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): 'url_id', 'check_time', 'last_sync').filter( is_success=True, last_sync__isnull=False, check_time__gte=cutoff_time) + if mirror_ids: + times = times.filter(url__mirror_id__in=mirror_ids) delays = {} for url_id, check_time, last_sync in times: delay = check_time - last_sync @@ -62,8 +67,10 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): if urls: last_check = max([u.last_check for u in urls]) num_checks = max([u.check_count for u in urls]) - check_info = MirrorLog.objects.filter( - check_time__gte=cutoff_time).aggregate( + check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time) + if mirror_ids: + check_info = check_info.filter(url__mirror_id__in=mirror_ids) + check_info = check_info.aggregate( mn=Min('check_time'), mx=Max('check_time')) if num_checks > 1: check_frequency = (check_info['mx'] - check_info['mn']) \ @@ -91,7 +98,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF): @cache_function(117) -def get_mirror_errors(cutoff=DEFAULT_CUTOFF): +def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff errors = MirrorLog.objects.filter( is_success=False, check_time__gte=cutoff_time, @@ -100,6 +107,10 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF): 'url__mirror__country', 'url__mirror__tier', 'error').annotate( error_count=Count('error'), last_occurred=Max('check_time') ).order_by('-last_occurred', '-error_count') + + if mirror_ids: + urls = urls.filter(mirror_id__in=mirror_ids) + errors = list(errors) for err in errors: ctry_code = err['url__country'] or err['url__mirror__country'] -- cgit v1.2.3-2-g168b From e26d5722289bd2e972633891d8dac09296b0cbc4 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sun, 11 Nov 2012 14:55:37 -0600 Subject: Mirror graph tweaking after usage with real data * Clamp y-axis minimum to 0. * Don't plot `is_success == false` values. * Ensure URLs are sorted predictably. Signed-off-by: Dan McGee --- mirrors/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index ba027c99..85e4ee93 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -41,7 +41,8 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration')) + duration_avg=Avg('logs__duration')).order_by( + 'mirror', 'url') if mirror_ids: urls = urls.filter(mirror_id__in=mirror_ids) -- cgit v1.2.3-2-g168b From a2cfa7edbbed8edb1ad4d3391c6edb055c13de1b Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Fri, 16 Nov 2012 15:39:56 -0600 Subject: Optimize mirror status data fetching Now that we have as many mirror URLs as we do, we can do a better job fetching and aggregating this data. The prior method resulted in a rather unwieldy query being pushed down to the database with a horrendously long GROUP BY clause. Instead of trying to group by everything at once so we can retrieve mirror URL info at the same time, separate the two queries- one for getting URL performance data, one for the qualitative data. The impetus behind fixing this is the PostgreSQL slow query log in production; this currently shows up the most of any queries we run in the system. Signed-off-by: Dan McGee --- mirrors/utils.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 85e4ee93..07a7138f 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -33,23 +33,26 @@ def annotate_url(url, delays): @cache_function(123) def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff - # I swear, this actually has decent performance... - urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( + url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( mirror__active=True, mirror__public=True, logs__check_time__gte=cutoff_time).annotate( check_count=Count('logs'), success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration')).order_by( - 'mirror', 'url') - - if mirror_ids: - urls = urls.filter(mirror_id__in=mirror_ids) + duration_avg=Avg('logs__duration')) vendor = database_vendor(MirrorUrl) if vendor != 'sqlite': - urls = urls.annotate(duration_stddev=StdDev('logs__duration')) + url_data = url_data.annotate(duration_stddev=StdDev('logs__duration')) + + urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( + mirror__active=True, mirror__public=True, + logs__check_time__gte=cutoff_time).order_by('mirror__id', 'url') + + if mirror_ids: + url_data = url_data.filter(mirror_id__in=mirror_ids) + urls = urls.filter(mirror_id__in=mirror_ids) # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the @@ -66,6 +69,11 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): delays.setdefault(url_id, []).append(delay) if urls: + url_data = dict((item['id'], item) for item in url_data) + for url in urls: + for k, v in url_data.get(url.id, {}).items(): + if k not in ('id', 'mirror_id'): + setattr(url, k, v) last_check = max([u.last_check for u in urls]) num_checks = max([u.check_count for u in urls]) check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time) -- cgit v1.2.3-2-g168b From f0f6f7235a62186c1cae9c79036dde5d8821373d Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Mon, 19 Nov 2012 08:10:21 -0600 Subject: Fix mirror URL duplication in status view We need to ensure we don't duplicate URLs in the status view, so add a distinct() call back in to the queryset when it was inadvertently dropped in commit a2cfa7edbb. This negates a lot of the performance gains we had, unfortunately, so it looks like a nested subquery might be more efficient. Disappointing the planner can't do this for us. Signed-off-by: Dan McGee --- mirrors/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 07a7138f..a62c7f05 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -48,7 +48,8 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( mirror__active=True, mirror__public=True, - logs__check_time__gte=cutoff_time).order_by('mirror__id', 'url') + logs__check_time__gte=cutoff_time).distinct().order_by( + 'mirror__id', 'url') if mirror_ids: url_data = url_data.filter(mirror_id__in=mirror_ids) -- cgit v1.2.3-2-g168b From bec73c7a37c07821f145dbcf11435d4f2b94a149 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Sat, 12 Jan 2013 15:56:36 -0600 Subject: Round two of mirror status query improvements This seems to generate much more performant queries at the database level than what we were previously doing, and also doesn't show duplicate rows. Signed-off-by: Dan McGee --- mirrors/utils.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index a62c7f05..1d560021 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -33,9 +33,16 @@ def annotate_url(url, delays): @cache_function(123) def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff - url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( + + valid_urls = MirrorUrl.objects.filter( mirror__active=True, mirror__public=True, - logs__check_time__gte=cutoff_time).annotate( + logs__check_time__gte=cutoff_time).distinct() + + if mirror_ids: + valid_urls = valid_urls.filter(mirror_id__in=mirror_ids) + + url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( + id__in=valid_urls, logs__check_time__gte=cutoff_time).annotate( check_count=Count('logs'), success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), @@ -47,13 +54,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): url_data = url_data.annotate(duration_stddev=StdDev('logs__duration')) urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( - mirror__active=True, mirror__public=True, - logs__check_time__gte=cutoff_time).distinct().order_by( - 'mirror__id', 'url') - - if mirror_ids: - url_data = url_data.filter(mirror_id__in=mirror_ids) - urls = urls.filter(mirror_id__in=mirror_ids) + id__in=valid_urls).order_by('mirror__id', 'url') # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the -- cgit v1.2.3-2-g168b From 6f0ae6746baea657ee6d7c21ac0813a04f825443 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Mon, 14 Jan 2013 01:00:11 -0600 Subject: Drop country column from mirror table We now always look for this information at the URL level, not the mirror level. This simplifies quite a bit of code in and around the mirror views. Signed-off-by: Dan McGee --- mirrors/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mirrors/utils.py') diff --git a/mirrors/utils.py b/mirrors/utils.py index 1d560021..3ab176b3 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -115,7 +115,7 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): is_success=False, check_time__gte=cutoff_time, url__mirror__active=True, url__mirror__public=True).values( 'url__url', 'url__country', 'url__protocol__protocol', - 'url__mirror__country', 'url__mirror__tier', 'error').annotate( + 'url__mirror__tier', 'error').annotate( error_count=Count('error'), last_occurred=Max('check_time') ).order_by('-last_occurred', '-error_count') @@ -124,8 +124,7 @@ def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_ids=None): errors = list(errors) for err in errors: - ctry_code = err['url__country'] or err['url__mirror__country'] - err['country'] = Country(ctry_code) + err['country'] = Country(err['url__country']) return errors @@ -152,7 +151,7 @@ def get_mirror_url_for_download(cutoff=DEFAULT_CUTOFF): mirror_urls = MirrorUrl.objects.filter( mirror__public=True, mirror__active=True, protocol__default=True) # look first for a country-agnostic URL, then fall back to any HTTP URL - filtered_urls = mirror_urls.filter(mirror__country='')[:1] + filtered_urls = mirror_urls.filter(country='')[:1] if not filtered_urls: filtered_urls = mirror_urls[:1] if not filtered_urls: -- cgit v1.2.3-2-g168b