From 9eb90149c3e128316f98f4107bd0d53fdd33e1bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Gryglicki?= Date: Mon, 31 Jul 2023 05:52:19 +0000 Subject: [PATCH] Update project_company_stats metric to use HLL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ɓukasz Gryglicki --- metrics/shared/project_company_stats.sql | 44 +- .../shared/project_company_stats_nohll.sql | 408 ++++++++++++++++++ 2 files changed, 430 insertions(+), 22 deletions(-) create mode 100644 metrics/shared/project_company_stats_nohll.sql diff --git a/metrics/shared/project_company_stats.sql b/metrics/shared/project_company_stats.sql index c918aefb1..7b4bebe4e 100644 --- a/metrics/shared/project_company_stats.sql +++ b/metrics/shared/project_company_stats.sql @@ -72,14 +72,14 @@ select from ( select 'Commits' as metric, company, - count(distinct sha) as value + round(hll_cardinality(hll_add_agg(hll_hash_text(sha)))) as value from company_commits_data group by company union select 'Committers' as metric, company, - count(distinct actor_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(actor_id)))) as value from company_commits_data group by @@ -96,7 +96,7 @@ from ( when 'ForkEvent' then 'Forkers' end as metric, af.company_name as company, - count(distinct e.actor_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.actor_id)))) as value from gha_events e, gha_actors_affiliations af @@ -117,7 +117,7 @@ from ( af.company_name union select 'Contributors' as metric, af.company_name as company, - count(distinct e.actor_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.actor_id)))) as value from gha_events e, gha_actors_affiliations af @@ -136,7 +136,7 @@ from ( af.company_name union select 'Contributions' as metric, af.company_name as company, - count(distinct e.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.id)))) as value from gha_events e, gha_actors_affiliations af @@ -155,7 +155,7 @@ from ( af.company_name union select 'Repositories' as metric, af.company_name as company, - count(distinct e.repo_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.repo_id)))) as value from gha_events e, gha_actors_affiliations af @@ -170,7 +170,7 @@ from ( af.company_name union select 'Comments' as metric, af.company_name as company, - count(distinct c.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(c.id)))) as value from gha_comments c, gha_actors_affiliations af @@ -185,7 +185,7 @@ from ( af.company_name union select 'Commenters' as metric, af.company_name as company, - count(distinct c.user_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(c.user_id)))) as value from gha_comments c, gha_actors_affiliations af @@ -200,7 +200,7 @@ from ( af.company_name union select 'PR reviews' as metric, af.company_name as company, - count(distinct c.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(c.id)))) as value from gha_reviews c, gha_actors_affiliations af @@ -215,7 +215,7 @@ from ( af.company_name union select 'Issues' as metric, af.company_name as company, - count(distinct i.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(i.id)))) as value from gha_issues i, gha_actors_affiliations af @@ -231,7 +231,7 @@ from ( af.company_name union select 'PRs' as metric, af.company_name as company, - count(distinct i.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(i.id)))) as value from gha_issues i, gha_actors_affiliations af @@ -262,12 +262,12 @@ from ( af.company_name union select 'Commits' as metric, 'All', - count(distinct sha) as value + round(hll_cardinality(hll_add_agg(hll_hash_text(sha)))) as value from commits_data union select 'Committers' as metric, 'All', - count(distinct actor_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(actor_id)))) as value from commits_data union select case e.type @@ -282,7 +282,7 @@ from ( when 'ForkEvent' then 'Forkers' end as metric, 'All' as company, - count(distinct e.actor_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.actor_id)))) as value from gha_events e where @@ -297,7 +297,7 @@ from ( e.type union select 'Contributors' as metric, 'All' as company, - count(distinct e.actor_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.actor_id)))) as value from gha_events e where @@ -309,7 +309,7 @@ from ( and (lower(e.dup_actor_login) {{exclude_bots}}) union select 'Contributions' as metric, 'All' as company, - count(distinct e.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.id)))) as value from gha_events e where @@ -321,7 +321,7 @@ from ( and (lower(e.dup_actor_login) {{exclude_bots}}) union select 'Repositories' as metric, 'All' as company, - count(distinct e.repo_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(e.repo_id)))) as value from gha_events e where @@ -329,7 +329,7 @@ from ( and (lower(e.dup_actor_login) {{exclude_bots}}) union select 'Comments' as metric, 'All' as company, - count(distinct c.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(c.id)))) as value from gha_comments c where @@ -337,7 +337,7 @@ from ( and (lower(c.dup_user_login) {{exclude_bots}}) union select 'Commenters' as metric, 'All' as company, - count(distinct c.user_id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(c.user_id)))) as value from gha_comments c where @@ -345,7 +345,7 @@ from ( and (lower(c.dup_user_login) {{exclude_bots}}) union select 'PR reviews' as metric, 'All' as company, - count(distinct c.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(c.id)))) as value from gha_reviews c where @@ -353,7 +353,7 @@ from ( and (lower(c.dup_user_login) {{exclude_bots}}) union select 'Issues' as metric, 'All' as company, - count(distinct i.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(i.id)))) as value from gha_issues i where @@ -362,7 +362,7 @@ from ( and (lower(i.dup_user_login) {{exclude_bots}}) union select 'PRs' as metric, 'All' as company, - count(distinct i.id) as value + round(hll_cardinality(hll_add_agg(hll_hash_bigint(i.id)))) as value from gha_issues i where diff --git a/metrics/shared/project_company_stats_nohll.sql b/metrics/shared/project_company_stats_nohll.sql new file mode 100644 index 000000000..c918aefb1 --- /dev/null +++ b/metrics/shared/project_company_stats_nohll.sql @@ -0,0 +1,408 @@ +with company_commits_data as ( + select c.sha, + c.dup_actor_id as actor_id, + af.company_name as company + from + gha_commits c, + gha_actors_affiliations af + where + c.dup_actor_id = af.actor_id + and af.dt_from <= c.dup_created_at + and af.dt_to > c.dup_created_at + and {{period:c.dup_created_at}} + and (lower(c.dup_actor_login) {{exclude_bots}}) + and af.company_name != '' + union select c.sha, + c.author_id as actor_id, + af.company_name as company + from + gha_commits c, + gha_actors_affiliations af + where + c.author_id is not null + and c.author_id = af.actor_id + and af.dt_from <= c.dup_created_at + and af.dt_to > c.dup_created_at + and {{period:c.dup_created_at}} + and (lower(c.dup_author_login) {{exclude_bots}}) + and af.company_name != '' + union select c.sha, + c.committer_id as actor_id, + af.company_name as company + from + gha_commits c, + gha_actors_affiliations af + where + c.committer_id is not null + and c.committer_id = af.actor_id + and af.dt_from <= c.dup_created_at + and af.dt_to > c.dup_created_at + and {{period:c.dup_created_at}} + and (lower(c.dup_committer_login) {{exclude_bots}}) + and af.company_name != '' +), commits_data as ( + select c.sha, + c.dup_actor_id as actor_id + from + gha_commits c + where + {{period:c.dup_created_at}} + and (lower(c.dup_actor_login) {{exclude_bots}}) + union select c.sha, + c.author_id as actor_id + from + gha_commits c + where + c.author_id is not null + and {{period:c.dup_created_at}} + and (lower(c.dup_author_login) {{exclude_bots}}) + union select c.sha, + c.committer_id as actor_id + from + gha_commits c + where + c.committer_id is not null + and {{period:c.dup_created_at}} + and (lower(c.dup_committer_login) {{exclude_bots}}) +) +select + 'hcom,' || sub.metric as metric, + sub.company as name, + sub.value as value +from ( + select 'Commits' as metric, + company, + count(distinct sha) as value + from + company_commits_data + group by + company + union select 'Committers' as metric, + company, + count(distinct actor_id) as value + from + company_commits_data + group by + company + union select case e.type + when 'IssuesEvent' then 'Issue creators' + when 'PullRequestEvent' then 'PR creators' + when 'PushEvent' then 'Pushers' + when 'PullRequestReviewEvent' then 'PR reviewers' + when 'PullRequestReviewCommentEvent' then 'PR review commenters' + when 'IssueCommentEvent' then 'Issue commenters' + when 'CommitCommentEvent' then 'Commit commenters' + when 'WatchEvent' then 'Watchers' + when 'ForkEvent' then 'Forkers' + end as metric, + af.company_name as company, + count(distinct e.actor_id) as value + from + gha_events e, + gha_actors_affiliations af + where + e.actor_id = af.actor_id + and af.dt_from <= e.created_at + and af.dt_to > e.created_at + and e.type in ( + 'IssuesEvent', 'PullRequestEvent', 'PushEvent', + 'PullRequestReviewCommentEvent', 'IssueCommentEvent', + 'CommitCommentEvent', 'ForkEvent', 'WatchEvent', 'PullRequestReviewEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and af.company_name != '' + group by + e.type, + af.company_name + union select 'Contributors' as metric, + af.company_name as company, + count(distinct e.actor_id) as value + from + gha_events e, + gha_actors_affiliations af + where + e.actor_id = af.actor_id + and af.dt_from <= e.created_at + and af.dt_to > e.created_at + and e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'Contributions' as metric, + af.company_name as company, + count(distinct e.id) as value + from + gha_events e, + gha_actors_affiliations af + where + e.actor_id = af.actor_id + and af.dt_from <= e.created_at + and af.dt_to > e.created_at + and e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'Repositories' as metric, + af.company_name as company, + count(distinct e.repo_id) as value + from + gha_events e, + gha_actors_affiliations af + where + e.actor_id = af.actor_id + and af.dt_from <= e.created_at + and af.dt_to > e.created_at + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'Comments' as metric, + af.company_name as company, + count(distinct c.id) as value + from + gha_comments c, + gha_actors_affiliations af + where + c.user_id = af.actor_id + and af.dt_from <= c.created_at + and af.dt_to > c.created_at + and {{period:c.created_at}} + and (lower(c.dup_user_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'Commenters' as metric, + af.company_name as company, + count(distinct c.user_id) as value + from + gha_comments c, + gha_actors_affiliations af + where + c.user_id = af.actor_id + and af.dt_from <= c.created_at + and af.dt_to > c.created_at + and {{period:c.created_at}} + and (lower(c.dup_user_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'PR reviews' as metric, + af.company_name as company, + count(distinct c.id) as value + from + gha_reviews c, + gha_actors_affiliations af + where + c.user_id = af.actor_id + and af.dt_from <= c.submitted_at + and af.dt_to > c.submitted_at + and {{period:c.submitted_at}} + and (lower(c.dup_user_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'Issues' as metric, + af.company_name as company, + count(distinct i.id) as value + from + gha_issues i, + gha_actors_affiliations af + where + i.user_id = af.actor_id + and af.dt_from <= i.created_at + and af.dt_to > i.created_at + and {{period:i.created_at}} + and i.is_pull_request = false + and (lower(i.dup_user_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'PRs' as metric, + af.company_name as company, + count(distinct i.id) as value + from + gha_issues i, + gha_actors_affiliations af + where + i.user_id = af.actor_id + and af.dt_from <= i.created_at + and af.dt_to > i.created_at + and {{period:i.created_at}} + and i.is_pull_request = true + and (lower(i.dup_user_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'Events' as metric, + af.company_name as company, + count(e.id) as value + from + gha_events e, + gha_actors_affiliations af + where + e.actor_id = af.actor_id + and af.dt_from <= e.created_at + and af.dt_to > e.created_at + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and af.company_name != '' + group by + af.company_name + union select 'Commits' as metric, + 'All', + count(distinct sha) as value + from + commits_data + union select 'Committers' as metric, + 'All', + count(distinct actor_id) as value + from + commits_data + union select case e.type + when 'IssuesEvent' then 'Issue creators' + when 'PullRequestEvent' then 'PR creators' + when 'PushEvent' then 'Pushers' + when 'PullRequestReviewEvent' then 'PR reviewers' + when 'PullRequestReviewCommentEvent' then 'PR review commenters' + when 'IssueCommentEvent' then 'Issue commenters' + when 'CommitCommentEvent' then 'Commit commenters' + when 'WatchEvent' then 'Watchers' + when 'ForkEvent' then 'Forkers' + end as metric, + 'All' as company, + count(distinct e.actor_id) as value + from + gha_events e + where + e.type in ( + 'IssuesEvent', 'PullRequestEvent', 'PushEvent', + 'PullRequestReviewCommentEvent', 'IssueCommentEvent', + 'CommitCommentEvent', 'ForkEvent', 'WatchEvent', 'PullRequestReviewEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + group by + e.type + union select 'Contributors' as metric, + 'All' as company, + count(distinct e.actor_id) as value + from + gha_events e + where + e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + union select 'Contributions' as metric, + 'All' as company, + count(distinct e.id) as value + from + gha_events e + where + e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + union select 'Repositories' as metric, + 'All' as company, + count(distinct e.repo_id) as value + from + gha_events e + where + {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + union select 'Comments' as metric, + 'All' as company, + count(distinct c.id) as value + from + gha_comments c + where + {{period:c.created_at}} + and (lower(c.dup_user_login) {{exclude_bots}}) + union select 'Commenters' as metric, + 'All' as company, + count(distinct c.user_id) as value + from + gha_comments c + where + {{period:c.created_at}} + and (lower(c.dup_user_login) {{exclude_bots}}) + union select 'PR reviews' as metric, + 'All' as company, + count(distinct c.id) as value + from + gha_reviews c + where + {{period:c.submitted_at}} + and (lower(c.dup_user_login) {{exclude_bots}}) + union select 'Issues' as metric, + 'All' as company, + count(distinct i.id) as value + from + gha_issues i + where + {{period:i.created_at}} + and i.is_pull_request = false + and (lower(i.dup_user_login) {{exclude_bots}}) + union select 'PRs' as metric, + 'All' as company, + count(distinct i.id) as value + from + gha_issues i + where + {{period:i.created_at}} + and i.is_pull_request = true + and (lower(i.dup_user_login) {{exclude_bots}}) + union select 'Events' as metric, + 'All' as company, + count(e.id) as value + from + gha_events e + where + {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + ) sub +where + (sub.metric = 'Commenters' and sub.value > 2 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Comments' and sub.value > 3 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'PR reviews' and sub.value > 2 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Events' and sub.value > 10 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Forkers' and sub.value > 2 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Issue commenters' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Issue creators' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Issues' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'PR creators' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'PR reviewers' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'PR review commenters' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'PRs' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Repositories' and sub.value > 1 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric = 'Watchers' and sub.value > 3 * {{project_scale}} * sqrt({{range}}/1450.0)) + or (sub.metric in ( + 'Commit commenters', + 'Commits', + 'Committers', + 'Pushers', + 'Contributors', + 'Contributions' + ) and sub.value > 0.2 * {{project_scale}} * sqrt({{range}}/1450.0)) +order by + metric asc, + value desc, + name asc +;