diff --git a/questions/admin.py b/questions/admin.py
index e1eda749ef..17eddff3c8 100644
--- a/questions/admin.py
+++ b/questions/admin.py
@@ -6,6 +6,7 @@
 from django.utils.html import format_html
 from django_better_admin_arrayfield.admin.mixins import DynamicArrayMixin
 
+from questions.types import AggregationMethod
 from questions.models import (
     AggregateForecast,
     Conditional,
@@ -16,6 +17,7 @@
 from questions.services import build_question_forecasts
 from utils.csv_utils import export_all_data_for_questions
 from utils.models import CustomTranslationAdmin
+from questions.constants import ResolutionType
 
 
 @admin.register(Question)
@@ -42,6 +44,7 @@ class QuestionAdmin(CustomTranslationAdmin, DynamicArrayMixin):
         "export_selected_questions_data_anonymized",
         "rebuild_aggregation_history",
         "trigger_scoring",
+        "trigger_scoring_with_all_aggregations",
     ]
     list_filter = [
         "type",
@@ -133,7 +136,10 @@ def trigger_scoring(self, request, queryset: QuerySet[Question]):
         from scoring.utils import score_question
 
         for question in queryset:
-            if question.resolution in ["", None, "ambiguous", "annulled"]:
+            if not question.resolution or question.resolution in (
+                ResolutionType.AMBIGUOUS,
+                ResolutionType.ANNULLED,
+            ):
                 continue
             score_question(
                 question=question,
@@ -142,6 +148,27 @@ def trigger_scoring(self, request, queryset: QuerySet[Question]):
 
     trigger_scoring.short_description = "Trigger Scoring (does nothing if not resolved)"
 
+    def trigger_scoring_with_all_aggregations(
+        self, request, queryset: QuerySet[Question]
+    ):
+        from scoring.utils import score_question
+
+        for question in queryset:
+            if not question.resolution or question.resolution in (
+                ResolutionType.AMBIGUOUS,
+                ResolutionType.ANNULLED,
+            ):
+                continue
+            score_question(
+                question=question,
+                resolution=question.resolution,
+                aggregation_methods=list(AggregationMethod._member_map_.values()),
+            )
+
+    trigger_scoring_with_all_aggregations.short_description = (
+        "Trigger Scoring (Includes ALL Aggregations) (does nothing if not resolved)"
+    )
+
 
 @admin.register(Conditional)
 class ConditionalAdmin(admin.ModelAdmin):
diff --git a/scoring/score_math.py b/scoring/score_math.py
index 6fd7aec07a..ead8baaba6 100644
--- a/scoring/score_math.py
+++ b/scoring/score_math.py
@@ -1,8 +1,6 @@
 from dataclasses import dataclass
-from datetime import datetime
 
 import numpy as np
-from django.db.models import Prefetch
 from scipy.stats.mstats import gmean
 
 from questions.models import (
@@ -13,38 +11,29 @@
 )
 from questions.types import AggregationMethod
 from scoring.models import Score
-from users.models import User
 from utils.the_math.aggregations import get_aggregation_history
 
 
 @dataclass
 class AggregationEntry:
-    pmf: np.ndarray
+    pmf: np.ndarray | list[float]
     num_forecasters: int
     timestamp: float
 
 
 def get_geometric_means(
     forecasts: list[Forecast | AggregateForecast],
-    include_bots: bool = False,
 ) -> list[AggregationEntry]:
-    included_forecasts = forecasts
-    if not include_bots:
-        included_forecasts = [
-            f
-            for f in forecasts
-            if (isinstance(f, AggregateForecast) or f.author.is_bot is False)
-        ]
     geometric_means = []
-    timesteps: set[datetime] = set()
-    for forecast in included_forecasts:
+    timesteps: set[float] = set()
+    for forecast in forecasts:
         timesteps.add(forecast.start_time.timestamp())
         if forecast.end_time:
             timesteps.add(forecast.end_time.timestamp())
     for timestep in sorted(timesteps):
         prediction_values = [
             f.get_pmf()
-            for f in included_forecasts
+            for f in forecasts
            if f.start_time.timestamp() <= timestep
            and (f.end_time is None or f.end_time.timestamp() > timestep)
         ]
@@ -64,7 +53,7 @@ def get_medians(
     forecasts: list[Forecast | AggregateForecast],
 ) -> list[AggregationEntry]:
     medians = []
-    timesteps: set[datetime] = set()
+    timesteps: set[float] = set()
     for forecast in forecasts:
         timesteps.add(forecast.start_time.timestamp())
         if forecast.end_time:
@@ -89,7 +78,7 @@ def get_medians(
 @dataclass
 class ForecastScore:
     score: float
-    coverage: float = 0
+    coverage: float = 0.0
 
 
 def evaluate_forecasts_baseline_accuracy(
@@ -102,7 +91,7 @@
     open_bounds_count: int,
 ) -> list[ForecastScore]:
     total_duration = forecast_horizon_end - forecast_horizon_start
-    forecast_scores: list[tuple[float, float]] = []
+    forecast_scores: list[ForecastScore] = []
     for forecast in forecasts:
         forecast_start = max(forecast.start_time.timestamp(), forecast_horizon_start)
         forecast_end = (
@@ -140,7 +129,7 @@ def evaluate_forecasts_baseline_spot_forecast(
     question_type: str,
     open_bounds_count: int,
 ) -> list[ForecastScore]:
-    forecast_scores: list[float] = []
+    forecast_scores: list[ForecastScore] = []
     for forecast in forecasts:
         start = forecast.start_time.timestamp()
         end = (
@@ -173,16 +162,13 @@ def evaluate_forecasts_peer_accuracy(
     forecast_horizon_end: float,
     question_type: str,
     geometric_means: list[AggregationEntry] | None = None,
-    include_bots_in_aggregates: bool = False,
 ) -> list[ForecastScore]:
     base_forecasts = base_forecasts or forecasts
-    geometric_mean_forecasts = geometric_means or get_geometric_means(
-        base_forecasts, include_bots_in_aggregates
-    )
+    geometric_mean_forecasts = geometric_means or get_geometric_means(base_forecasts)
     for gm in geometric_mean_forecasts:
         gm.timestamp = max(gm.timestamp, forecast_horizon_start)
     total_duration = forecast_horizon_end - forecast_horizon_start
-    forecast_scores: list[float] = []
+    forecast_scores: list[ForecastScore] = []
     for forecast in forecasts:
         forecast_start = max(forecast.start_time.timestamp(), forecast_horizon_start)
         forecast_end = (
@@ -209,8 +195,8 @@ def evaluate_forecasts_peer_accuracy(
             else:
                 interval_scores.append(None)
 
-        forecast_score = 0
-        forecast_coverage = 0
+        forecast_score: float = 0.0
+        forecast_coverage: float = 0.0
         times = [
             gm.timestamp
             for gm in geometric_mean_forecasts
@@ -234,12 +220,9 @@ def evaluate_forecasts_peer_spot_forecast(
     spot_forecast_timestamp: float,
     question_type: str,
     geometric_means: list[AggregationEntry] | None = None,
-    include_bots_in_aggregates: bool = False,
 ) -> list[ForecastScore]:
     base_forecasts = base_forecasts or forecasts
-    geometric_mean_forecasts = geometric_means or get_geometric_means(
-        base_forecasts, include_bots_in_aggregates
-    )
+    geometric_mean_forecasts = geometric_means or get_geometric_means(base_forecasts)
     g = None
     for gm in geometric_mean_forecasts[::-1]:
         if gm.timestamp < spot_forecast_timestamp:
@@ -248,7 +231,7 @@
     if g is None:
         return [ForecastScore(0)] * len(forecasts)
 
-    forecast_scores: list[float] = []
+    forecast_scores: list[ForecastScore] = []
     for forecast in forecasts:
         start = forecast.start_time.timestamp()
         end = (
@@ -285,7 +268,7 @@
         for bf in base_forecasts
     ]
     total_duration = actual_close_time - forecast_horizon_start
-    forecast_scores: list[float] = []
+    forecast_scores: list[ForecastScore] = []
     for forecast in forecasts:
         forecast_start = max(forecast.start_time.timestamp(), forecast_horizon_start)
         forecast_end = (
@@ -306,8 +289,8 @@
             else:
                 interval_scores.append(None)
 
-        forecast_score = 0
-        forecast_coverage = 0
+        forecast_score: float = 0.0
+        forecast_coverage: float = 0.0
         times = [
             bf.timestamp
             for bf in baseline_forecasts
@@ -329,50 +312,67 @@ def evaluate_question(
     resolution_bucket: int | None,
     score_types: list[Score.ScoreTypes],
     spot_forecast_timestamp: float | None = None,
+    aggregation_methods: list[AggregationMethod] | None = None,
+    score_users: bool | list[int] = True,
 ) -> list[Score]:
+    aggregation_methods = aggregation_methods or []
+    aggregations_to_calculate = aggregation_methods.copy()
+    if Score.ScoreTypes.RELATIVE_LEGACY in score_types and (
+        AggregationMethod.RECENCY_WEIGHTED not in aggregations_to_calculate
+    ):
+        aggregations_to_calculate.append(AggregationMethod.RECENCY_WEIGHTED)
     if resolution_bucket is None:
         return []
 
     forecast_horizon_start = question.open_time.timestamp()
     actual_close_time = question.actual_close_time.timestamp()
     forecast_horizon_end = question.scheduled_close_time.timestamp()
 
-    user_forecasts = (
-        question.user_forecasts.all()
-        .prefetch_related(Prefetch("author", queryset=User.objects.only("is_bot")))
-        .order_by("start_time")
-    )
-    community_forecasts = get_aggregation_history(
+    # We need all user forecasts to calculate GeoMean even
+    # if we're only scoring some or none of the users
+    user_forecasts = question.user_forecasts.all()
+    if score_users is True:
+        scoring_user_forecasts = user_forecasts
+    elif not score_users:
+        scoring_user_forecasts = Forecast.objects.none()
+    else:  # we have a list of user ids to score
+        scoring_user_forecasts = user_forecasts.filter(author_id__in=score_users)
+    if not question.include_bots_in_aggregates:
+        user_forecasts = user_forecasts.exclude(author__is_bot=True)
+    aggregations = get_aggregation_history(
         question,
         minimize=False,
-        aggregation_methods=[AggregationMethod.RECENCY_WEIGHTED],
+        aggregation_methods=aggregations_to_calculate,
         include_bots=question.include_bots_in_aggregates,
-    )[AggregationMethod.RECENCY_WEIGHTED]
+        include_stats=False,
+    )
+    recency_weighted_aggregation = aggregations.get(AggregationMethod.RECENCY_WEIGHTED)
 
     geometric_means: list[AggregationEntry] = []
-    ScoreTypes = Score.ScoreTypes
-    if ScoreTypes.PEER in score_types:
-        geometric_means = get_geometric_means(
-            user_forecasts, include_bots=question.include_bots_in_aggregates
-        )
+    if Score.ScoreTypes.PEER in score_types:
+        geometric_means = get_geometric_means(user_forecasts)
     scores: list[Score] = []
     for score_type in score_types:
-        match score_type:
-            case ScoreTypes.BASELINE:
-                open_bounds_count = bool(question.open_upper_bound) + bool(
-                    question.open_lower_bound
-                )
-                user_scores = evaluate_forecasts_baseline_accuracy(
-                    user_forecasts,
-                    resolution_bucket,
-                    forecast_horizon_start,
-                    actual_close_time,
-                    forecast_horizon_end,
-                    question.type,
-                    open_bounds_count,
-                )
-                community_scores = evaluate_forecasts_baseline_accuracy(
-                    community_forecasts,
+        aggregation_scores: dict[AggregationMethod, list[Score | ForecastScore]] = (
+            dict()
+        )
+        if score_type == Score.ScoreTypes.BASELINE:
+            open_bounds_count = bool(question.open_upper_bound) + bool(
+                question.open_lower_bound
+            )
+            user_scores = evaluate_forecasts_baseline_accuracy(
+                scoring_user_forecasts,
+                resolution_bucket,
+                forecast_horizon_start,
+                actual_close_time,
+                forecast_horizon_end,
+                question.type,
+                open_bounds_count,
+            )
+            for method in aggregation_methods:
+                aggregation_forecasts = aggregations[method]
+                aggregation_scores[method] = evaluate_forecasts_baseline_accuracy(
+                    aggregation_forecasts,
                     resolution_bucket,
                     forecast_horizon_start,
                     actual_close_time,
@@ -380,37 +380,41 @@ def evaluate_question(
                     question.type,
                     open_bounds_count,
                 )
-            case ScoreTypes.SPOT_BASELINE:
-                open_bounds_count = bool(question.open_upper_bound) + bool(
-                    question.open_lower_bound
-                )
-                user_scores = evaluate_forecasts_baseline_spot_forecast(
-                    user_forecasts,
-                    resolution_bucket,
-                    spot_forecast_timestamp,
-                    question.type,
-                    open_bounds_count,
-                )
-                community_scores = evaluate_forecasts_baseline_spot_forecast(
-                    community_forecasts,
+        elif score_type == Score.ScoreTypes.SPOT_BASELINE:
+            open_bounds_count = bool(question.open_upper_bound) + bool(
+                question.open_lower_bound
+            )
+            user_scores = evaluate_forecasts_baseline_spot_forecast(
+                scoring_user_forecasts,
+                resolution_bucket,
+                spot_forecast_timestamp,
+                question.type,
+                open_bounds_count,
+            )
+            for method in aggregation_methods:
+                aggregation_forecasts = aggregations[method]
+                aggregation_scores[method] = evaluate_forecasts_baseline_spot_forecast(
+                    aggregation_forecasts,
                     resolution_bucket,
                     spot_forecast_timestamp,
                     question.type,
                     open_bounds_count,
                 )
-            case ScoreTypes.PEER:
-                user_scores = evaluate_forecasts_peer_accuracy(
-                    user_forecasts,
-                    user_forecasts,
-                    resolution_bucket,
-                    forecast_horizon_start,
-                    actual_close_time,
-                    forecast_horizon_end,
-                    question.type,
-                    geometric_means=geometric_means,
-                )
-                community_scores = evaluate_forecasts_peer_accuracy(
-                    community_forecasts,
+        elif score_type == Score.ScoreTypes.PEER:
+            user_scores = evaluate_forecasts_peer_accuracy(
+                scoring_user_forecasts,
+                user_forecasts,
+                resolution_bucket,
+                forecast_horizon_start,
+                actual_close_time,
+                forecast_horizon_end,
+                question.type,
+                geometric_means=geometric_means,
+            )
+            for method in aggregation_methods:
+                aggregation_forecasts = aggregations[method]
+                aggregation_scores[method] = evaluate_forecasts_peer_accuracy(
+                    aggregation_forecasts,
                     user_forecasts,
                     resolution_bucket,
                     forecast_horizon_start,
@@ -419,46 +423,50 @@
                     question.type,
                     geometric_means=geometric_means,
                 )
-            case ScoreTypes.SPOT_PEER:
-                user_scores = evaluate_forecasts_peer_spot_forecast(
-                    user_forecasts,
-                    user_forecasts,
-                    resolution_bucket,
-                    spot_forecast_timestamp,
-                    question.type,
-                    geometric_means=geometric_means,
-                )
-                community_scores = evaluate_forecasts_peer_spot_forecast(
-                    community_forecasts,
+        elif score_type == Score.ScoreTypes.SPOT_PEER:
+            user_scores = evaluate_forecasts_peer_spot_forecast(
+                scoring_user_forecasts,
+                user_forecasts,
+                resolution_bucket,
+                spot_forecast_timestamp,
+                question.type,
+                geometric_means=geometric_means,
+            )
+            for method in aggregation_methods:
+                aggregation_forecasts = aggregations[method]
+                aggregation_scores[method] = evaluate_forecasts_peer_spot_forecast(
+                    aggregation_forecasts,
                     user_forecasts,
                     resolution_bucket,
                     spot_forecast_timestamp,
                     question.type,
                     geometric_means=geometric_means,
                 )
-            case ScoreTypes.RELATIVE_LEGACY:
-                user_scores = evaluate_forecasts_legacy_relative(
-                    user_forecasts,
-                    community_forecasts,
-                    resolution_bucket,
-                    forecast_horizon_start,
-                    actual_close_time,
-                )
-                community_scores = evaluate_forecasts_legacy_relative(
-                    community_forecasts,
-                    community_forecasts,
+        elif score_type == Score.ScoreTypes.RELATIVE_LEGACY:
+            user_scores = evaluate_forecasts_legacy_relative(
+                scoring_user_forecasts,
+                recency_weighted_aggregation,
+                resolution_bucket,
+                forecast_horizon_start,
+                actual_close_time,
+            )
+            for method in aggregation_methods:
+                aggregation_forecasts = aggregations[method]
+                aggregation_scores[method] = evaluate_forecasts_legacy_relative(
+                    aggregation_forecasts,
+                    recency_weighted_aggregation,
                     resolution_bucket,
                     forecast_horizon_start,
                     actual_close_time,
                 )
-            case other:
-                raise NotImplementedError(f"Score type {other} not implemented")
+        else:
+            raise NotImplementedError(f"Score type {score_type} not implemented")
 
-        user_ids = {forecast.author_id for forecast in user_forecasts}
+        user_ids = {forecast.author_id for forecast in scoring_user_forecasts}
         for user_id in user_ids:
-            user_score = 0
-            user_coverage = 0
-            for forecast, score in zip(user_forecasts, user_scores):
+            user_score: float = 0.0
+            user_coverage: float = 0.0
+            for forecast, score in zip(scoring_user_forecasts, user_scores):
                 if forecast.author_id == user_id:
                     user_score += score.score
                     user_coverage += score.coverage
@@ -471,18 +479,20 @@
                     score_type=score_type,
                 )
             )
-        community_score = 0
-        community_coverage = 0
-        for score in community_scores:
-            community_score += score.score
-            community_coverage += score.coverage
-        scores.append(
-            Score(
-                user=None,
-                aggregation_method=AggregationMethod.RECENCY_WEIGHTED,
-                score=community_score,
-                coverage=community_coverage,
-                score_type=score_type,
+        for method in aggregation_methods:
+            aggregation_score: float = 0.0
+            aggregation_coverage: float = 0.0
+            community_scores = aggregation_scores[method]
+            for score in community_scores:
+                aggregation_score += score.score
+                aggregation_coverage += score.coverage
+            scores.append(
+                Score(
+                    user=None,
+                    aggregation_method=method,
+                    score=aggregation_score,
+                    coverage=aggregation_coverage,
+                    score_type=score_type,
+                )
             )
-        )
     return scores
diff --git a/scoring/utils.py b/scoring/utils.py
index ddc7cd6530..0d906b81c8 100644
--- a/scoring/utils.py
+++ b/scoring/utils.py
@@ -55,7 +55,12 @@ def score_question(
     resolution: str,
     spot_scoring_time: float | None = None,
     score_types: list[str] | None = None,
+    aggregation_methods: list[AggregationMethod] | None = None,
+    protect_uncalculated_scores: bool = False,
+    score_users: bool | list[int] = True,
 ):
+    if aggregation_methods is None:
+        aggregation_methods = [AggregationMethod.RECENCY_WEIGHTED]
     resolution_bucket = string_location_to_bucket_index(resolution, question)
     if not spot_scoring_time:
         if question.spot_scoring_time:
@@ -74,23 +79,31 @@ def score_question(
         for score in previous_scores
     }
 
     new_scores = evaluate_question(
-        question,
-        resolution_bucket,
-        score_types,
-        spot_scoring_time,
+        question=question,
+        resolution_bucket=resolution_bucket,
+        score_types=score_types,
+        spot_forecast_timestamp=spot_scoring_time,
+        aggregation_methods=aggregation_methods,
+        score_users=score_users,
     )
 
+    seen = set()
     for new_score in new_scores:
         previous_score_id = previous_scores_map.get(
             (new_score.user_id, new_score.aggregation_method, new_score.score_type)
         )
+        if previous_score_id:
+            seen.add(previous_score_id)
 
         new_score.id = previous_score_id
         new_score.question = question
         new_score.edited_at = question.resolution_set_time
 
     with transaction.atomic():
-        previous_scores.delete()
+        scores_to_delete = previous_scores
+        if protect_uncalculated_scores:
+            scores_to_delete = scores_to_delete.filter(id__in=seen)
+        scores_to_delete.delete()
 
         Score.objects.bulk_create(new_scores, batch_size=500)
diff --git a/utils/the_math/aggregations.py b/utils/the_math/aggregations.py
index 0df22f11d0..823c2e4c58 100644
--- a/utils/the_math/aggregations.py
+++ b/utils/the_math/aggregations.py
@@ -154,10 +154,10 @@ def calculate_aggregation_entry(
     )
     aggregation = AggregateForecast(forecast_values=normalized_medians.tolist())
+    aggregation.start_time = forecast_set.timestep
+    aggregation.forecaster_count = len(forecast_set.forecasts_values)
 
     if include_stats:
         forecasts_values = np.array(forecast_set.forecasts_values)
-        aggregation.start_time = forecast_set.timestep
-        aggregation.forecaster_count = len(forecast_set.forecasts_values)
         if question_type in [
             Question.QuestionType.BINARY,
             Question.QuestionType.MULTIPLE_CHOICE,