From 504c783e271c30e998da6de566253e6027f066cf Mon Sep 17 00:00:00 2001 From: Arunodoy18 Date: Wed, 26 Nov 2025 03:03:18 +0530 Subject: [PATCH] Fix company contribution calculation discrepancies in governance data - Add company-level aggregation queries using count(distinct e.id) to both kubernetes and shared project_developer_stats.sql - Prevents double-counting when multiple developers from same company work on same GitHub events - Ensures consistency between companies table and developer activity dashboards for governance decisions - Addresses Istio governance data discrepancy where Google showed 12738 vs 12615 contributions in different views - Fixes potential ranking issues for top 5 company contributions across all CNCF projects Signed-off-by: Arunodoy18 --- .../kubernetes/project_developer_stats.sql | 55 +++++++++++++++++++ metrics/shared/project_developer_stats.sql | 55 +++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/metrics/kubernetes/project_developer_stats.sql b/metrics/kubernetes/project_developer_stats.sql index d4c3af49..7cb0c765 100644 --- a/metrics/kubernetes/project_developer_stats.sql +++ b/metrics/kubernetes/project_developer_stats.sql @@ -1184,6 +1184,61 @@ where ) ) -- limit amount of data + +-- Add company-level aggregation using distinct event counting to match companies table logic +-- This prevents discrepancies between individual developer sums and direct company calculations +union select 'hdev_contributions_company,All_All' as metric, + coalesce(aa.company_name, 'Independent') as name, + count(distinct e.id) as value +from + gha_events e +left join + gha_actors_affiliations aa +on + aa.actor_id = e.actor_id + and aa.dt_from <= e.created_at + and aa.dt_to > e.created_at +where + e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and coalesce(aa.company_name, '') != '' +group by + aa.company_name +having + count(distinct e.id) >= 30 + +union select 'hdev_contributions_company,' || r.repo_group || '_All' as metric, + coalesce(aa.company_name, 'Independent') as name, + count(distinct e.id) as value +from + gha_events e, + gha_repos r +left join + gha_actors_affiliations aa +on + aa.actor_id = e.actor_id + and aa.dt_from <= e.created_at + and aa.dt_to > e.created_at +where + e.repo_id = r.id + and e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and r.repo_group in (select repo_group_name from trepo_groups) + and coalesce(aa.company_name, '') != '' +group by + r.repo_group, + aa.company_name +having + count(distinct e.id) >= 30 + order by metric asc, value desc, diff --git a/metrics/shared/project_developer_stats.sql b/metrics/shared/project_developer_stats.sql index b5c32f3f..709afdc7 100644 --- a/metrics/shared/project_developer_stats.sql +++ b/metrics/shared/project_developer_stats.sql @@ -1135,6 +1135,61 @@ where ) and sub.value > 0.2 * {{project_scale}} * sqrt({{range}}/1450.0) ) -- limit amount of data + +-- Add company-level aggregation using distinct event counting to match companies table logic +-- This prevents discrepancies between individual developer sums and direct company calculations +union select 'hdev_contributions_company,All_All' as metric, + coalesce(aa.company_name, 'Independent') as name, + count(distinct e.id) as value +from + gha_events e +left join + gha_actors_affiliations aa +on + aa.actor_id = e.actor_id + and aa.dt_from <= e.created_at + and aa.dt_to > e.created_at +where + e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and coalesce(aa.company_name, '') != '' +group by + aa.company_name +having + count(distinct e.id) >= 30 + +union select 'hdev_contributions_company,' || r.repo_group || '_All' as metric, + coalesce(aa.company_name, 'Independent') as name, + count(distinct e.id) as value +from + gha_events e, + gha_repo_groups r +left join + gha_actors_affiliations aa +on + aa.actor_id = e.actor_id + and aa.dt_from <= e.created_at + and aa.dt_to > e.created_at +where + e.repo_id = r.id + and e.type in ( + 'PushEvent', 'PullRequestEvent', 'IssuesEvent', 'PullRequestReviewEvent', + 'CommitCommentEvent', 'IssueCommentEvent', 'PullRequestReviewCommentEvent' + ) + and {{period:e.created_at}} + and (lower(e.dup_actor_login) {{exclude_bots}}) + and r.repo_group in (select repo_group_name from trepo_groups) + and coalesce(aa.company_name, '') != '' +group by + r.repo_group, + aa.company_name +having + count(distinct e.id) >= 30 + order by metric asc, value desc,