summaryrefslogtreecommitdiffstats
path: root/.github/scripts/analytics/github_issue_mapping.py
diff options
context:
space:
mode:
authorKirill Rysin <[email protected]>2026-04-08 14:17:46 +0200
committerGitHub <[email protected]>2026-04-08 15:17:46 +0300
commitc7d57dfe0fbf696215040469558aeb2a2fb81d52 (patch)
tree832d7aa6dd49716ea50e26daaf45df238ce7e16c /.github/scripts/analytics/github_issue_mapping.py
parent1864747253932624d9201f3c2fd904cd257e29a4 (diff)
Dev new automerge mute (#37417)
Co-authored-by: Cursor Agent <[email protected]> Co-authored-by: Kirill Rysin <[email protected]>
Diffstat (limited to '.github/scripts/analytics/github_issue_mapping.py')
-rwxr-xr-x.github/scripts/analytics/github_issue_mapping.py293
1 files changed, 253 insertions, 40 deletions
diff --git a/.github/scripts/analytics/github_issue_mapping.py b/.github/scripts/analytics/github_issue_mapping.py
index a97606da449..72aca8b36cc 100755
--- a/.github/scripts/analytics/github_issue_mapping.py
+++ b/.github/scripts/analytics/github_issue_mapping.py
@@ -3,41 +3,68 @@
"""
Create a mapping table between test names and GitHub issues.
This table will be used by SQL queries to join muted test data with GitHub issue information.
+
+area_override logic
+-------------------
+Each muted-test issue has a default owner (from TESTOWNERS via the ``Owner:`` line in the
+issue body). A human can add an ``area/...`` label (e.g. ``area/blobstorage``). If the
+team behind that area (via ``area_to_owner_mapping``) **differs** from the default owner,
+we store the **area path string** in ``area_override``. ``tests_monitor.py`` reads this table
+(plus ``area_to_owner_mapping``) and fills ``effective_area`` / ``effective_owner_team`` on each
+row; downstream marts use those columns.
+
+Edge cases:
+ * No ``area`` in issue ``info`` → area_override = NULL
+ * Area resolves to the same team as the default owner → area_override = NULL
+ * Area not found in mapping → area_override = NULL
+ * Labels change → we always see the *current* ``info`` snapshot
+
+``area_override_since`` (Date)
+-------------------------------
+First ``date_window`` for which datamarts apply ``area_override``. Set from the issue's
+``updated_at`` (UTC date) when the override **value** changes vs the previous row in YDB;
+unchanged override keeps the stored date. ``NULL`` on ``area_override_since`` means there is
+no lower bound: override applies for every ``date_window`` in the mart query (same as omitting
+the check in SQL).
"""
+import datetime as dt
+import json
import os
+import re
import ydb
import time
import sys
from ydb_wrapper import YDBWrapper
-# Import shared GitHub issue utilities
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
-from github_issue_utils import create_test_issue_mapping
-
-
+from github_issue_utils import (
+ create_test_issue_mapping,
+ DEFAULT_BUILD_TYPE,
+ scan_to_utc_date,
+)
def get_github_issues_data(ydb_wrapper):
- """Get GitHub issues data from the issues table"""
- # Get table path from config
+ """Get GitHub issues data from the issues table, including labels info."""
issues_table = ydb_wrapper.get_table_path("issues")
query = f"""
- SELECT
+ SELECT
issue_number,
title,
url,
state,
body,
created_at,
- updated_at
+ updated_at,
+ info
FROM `{issues_table}`
WHERE body IS NOT NULL
AND body != ''
"""
-
+
print("Fetching GitHub issues data...")
-
+
try:
results = ydb_wrapper.execute_scan_query(query)
print(f'Fetched {len(results)} GitHub issues')
@@ -48,10 +75,160 @@ def get_github_issues_data(ydb_wrapper):
return []
+def _norm_area_override_value(v):
+ if v is None:
+ return None
+ s = str(v).strip()
+ return s if s else None
+
+
+def _fetch_existing_github_issue_mapping(ydb_wrapper, table_path: str) -> dict | None:
+ """Return dict keyed by (full_name, branch, build_type, github_issue_number), or None on failure."""
+ try:
+ rows = ydb_wrapper.execute_scan_query(
+ f"""
+ SELECT
+ full_name,
+ branch,
+ build_type,
+ github_issue_number,
+ area_override,
+ area_override_since
+ FROM `{table_path}`
+ """,
+ query_name="github_issue_mapping_read_existing_for_since",
+ )
+ except Exception as e:
+ print(
+ f"Error: cannot read `{table_path}` (add column area_override_since or migrate): {e}",
+ file=sys.stderr,
+ )
+ return None
+ out = {}
+ for r in rows:
+ key = (r["full_name"], r["branch"], r["build_type"], r["github_issue_number"])
+ out[key] = r
+ return out
+
+
+def merge_area_override_since(mapping_data: list, existing_by_key: dict, url_to_updated_at: dict) -> None:
+ """Set area_override_since on each row in place (mutates mapping_data)."""
+ today_utc = dt.datetime.now(dt.timezone.utc).date()
+ for row in mapping_data:
+ key = (
+ row["full_name"],
+ row["branch"],
+ row["build_type"],
+ row["github_issue_number"],
+ )
+ old = existing_by_key.get(key)
+ new_ao = _norm_area_override_value(row.get("area_override"))
+ old_ao = _norm_area_override_value(old.get("area_override")) if old else None
+ url = row.get("github_issue_url") or ""
+ since_from_issue = scan_to_utc_date(url_to_updated_at.get(url))
+ if new_ao is None:
+ row["area_override_since"] = None
+ elif old is None or old_ao != new_ao:
+ row["area_override_since"] = since_from_issue or today_utc
+ else:
+ row["area_override_since"] = scan_to_utc_date(old.get("area_override_since"))
+
+
+def get_area_to_owner_mapping(ydb_wrapper):
+ """Load area -> owner_team mapping from YDB."""
+ try:
+ table_path = ydb_wrapper.get_table_path("area_to_owner_mapping")
+ rows = ydb_wrapper.execute_scan_query(
+ f"SELECT area, owner_team FROM `{table_path}`",
+ query_name="get_area_to_owner_mapping",
+ )
+ mapping = {}
+ for row in rows:
+ area = row.get("area", "")
+ owner = row.get("owner_team", "")
+ if area and owner:
+ mapping[area] = owner
+ print(f"Loaded area_to_owner_mapping: {len(mapping)} entries")
+ return mapping
+ except Exception as e:
+ print(f"Warning: Could not load area_to_owner_mapping: {e}")
+ return {}
+
+
+_OWNER_RE = re.compile(r'Owner:\s*(?:TEAM:@ydb-platform/)?(\S+)', re.IGNORECASE)
+
+
+def _extract_default_owner(body: str) -> str:
+ """Extract the default owner team name from issue body ``Owner:`` line.
+
+ Returns lowercase team name or empty string.
+ """
+ if not body:
+ return ""
+ m = _OWNER_RE.search(body)
+ return m.group(1).lower() if m else ""
+
+
+def _extract_area_from_info(info_raw) -> str:
+ """Extract area/ label value from the ``info`` JSON column.
+
+ Returns the area string (e.g. 'area/queryprocessor') or empty string.
+ """
+ if not info_raw:
+ return ""
+ try:
+ if isinstance(info_raw, str):
+ info = json.loads(info_raw)
+ elif isinstance(info_raw, dict):
+ info = info_raw
+ else:
+ info = json.loads(str(info_raw))
+ except (json.JSONDecodeError, TypeError):
+ return ""
+ return info.get("area") or ""
+
+
+def _resolve_area_to_team(area_label: str, area_to_owner: dict) -> str:
+ """Prefix match: area/cs/compression -> area/cs (longest mapping key wins).
+
+ Mirrors the SQL logic: ``WHERE a.area = om.area OR StartsWith(a.area, om.area || '/')``
+ with ``ORDER BY LENGTH(om.area) DESC``.
+ """
+ best_team = ""
+ best_len = -1
+ for mapping_area, team in area_to_owner.items():
+ if area_label == mapping_area or area_label.startswith(mapping_area + '/'):
+ if len(mapping_area) > best_len:
+ best_team = team
+ best_len = len(mapping_area)
+ return best_team
+
+
+def resolve_area_override(body: str, info_raw, area_to_owner: dict):
+ """If GitHub ``area/...`` implies a different team than ``Owner:``, return that area path.
+
+ Stored value matches issue info (e.g. ``area/blobstorage``), not the resolved team slug.
+ """
+ area_label = (_extract_area_from_info(info_raw) or "").strip()
+ if not area_label:
+ return None
+
+ resolved_team = _resolve_area_to_team(area_label, area_to_owner)
+ if not resolved_team:
+ return None
+
+ default_owner = _extract_default_owner(body)
+
+ if resolved_team.lower() == default_owner.lower():
+ return None
+
+ return area_label
+
+
def create_test_issue_mapping_table(ydb_wrapper, table_path):
"""Create the test-to-issue mapping table"""
print(f"Creating test-to-issue mapping table: {table_path}")
-
+
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS `{table_path}` (
`full_name` Utf8 NOT NULL,
@@ -62,6 +239,8 @@ def create_test_issue_mapping_table(ydb_wrapper, table_path):
`github_issue_number` Uint64 NOT NULL,
`github_issue_state` Utf8,
`github_issue_created_at` Timestamp,
+ `area_override` Utf8,
+ `area_override_since` Date,
PRIMARY KEY (full_name, branch, build_type, github_issue_number)
)
PARTITION BY HASH(full_name)
@@ -75,33 +254,40 @@ def create_test_issue_mapping_table(ydb_wrapper, table_path):
def convert_mapping_to_table_data(test_to_issue_mapping):
"""Convert the test-to-issue mapping to table data format"""
table_data = []
-
+
for test_name, issues in test_to_issue_mapping.items():
- if issues:
- # Sort issues by created_at (most recent first) and take the latest one
- sorted_issues = sorted(issues, key=lambda x: x.get('created_at', 0), reverse=True)
- latest_issue = sorted_issues[0]
-
- # Create a separate record for each branch of the latest issue
+ if not issues:
+ continue
+
+ # Group issues by build_type, then pick the latest per group
+ by_build_type = {}
+ for issue in issues:
+ bt = issue.get('build_type', DEFAULT_BUILD_TYPE)
+ existing = by_build_type.get(bt)
+ if existing is None or issue.get('created_at', 0) > existing.get('created_at', 0):
+ by_build_type[bt] = issue
+
+ for bt, latest_issue in by_build_type.items():
for branch in latest_issue['branches']:
table_data.append({
'full_name': test_name,
'branch': branch,
- 'build_type': latest_issue.get('build_type', 'relwithdebinfo'),
+ 'build_type': bt,
'github_issue_url': latest_issue['url'],
'github_issue_title': latest_issue['title'],
'github_issue_number': latest_issue['issue_number'],
'github_issue_state': latest_issue['state'],
'github_issue_created_at': latest_issue.get('created_at'),
+ 'area_override': latest_issue.get('area_override'),
})
-
+
return table_data
def bulk_upsert_mapping_data(ydb_wrapper, table_path, mapping_data):
"""Bulk upsert mapping data into the table"""
print(f"Bulk upserting {len(mapping_data)} test-to-issue mappings to {table_path}")
-
+
column_types = ydb.BulkUpsertColumns()
column_types.add_column('full_name', ydb.PrimitiveType.Utf8)
column_types.add_column('branch', ydb.PrimitiveType.Utf8)
@@ -111,7 +297,9 @@ def bulk_upsert_mapping_data(ydb_wrapper, table_path, mapping_data):
column_types.add_column('github_issue_number', ydb.PrimitiveType.Uint64)
column_types.add_column('github_issue_state', ydb.OptionalType(ydb.PrimitiveType.Utf8))
column_types.add_column('github_issue_created_at', ydb.OptionalType(ydb.PrimitiveType.Timestamp))
-
+ column_types.add_column('area_override', ydb.OptionalType(ydb.PrimitiveType.Utf8))
+ column_types.add_column('area_override_since', ydb.OptionalType(ydb.PrimitiveType.Date))
+
ydb_wrapper.bulk_upsert(table_path, mapping_data, column_types)
print(f"Bulk upsert completed")
@@ -120,49 +308,74 @@ def main():
"""Main function to create the test-to-issue mapping table"""
print("Starting GitHub issue mapping table creation")
script_start_time = time.time()
-
+
with YDBWrapper() as ydb_wrapper:
- # Check credentials
if not ydb_wrapper.check_credentials():
return 1
-
+
table_path = ydb_wrapper.get_table_path("github_issue_mapping")
-
+
try:
- # Get GitHub issues data
issues_data = get_github_issues_data(ydb_wrapper)
-
+
if not issues_data:
print("No GitHub issues data found")
return 0
-
- # Create test-to-issue mapping using shared utilities
+
+ area_to_owner = get_area_to_owner_mapping(ydb_wrapper)
+
+ url_to_updated_at = {}
+ # Pre-compute area_override per issue URL for O(1) lookup
+ url_to_area_override = {}
+ for issue in issues_data:
+ url = issue.get('url', '')
+ if url:
+ url_to_updated_at[url] = issue.get("updated_at") or issue.get("created_at")
+ url_to_area_override[url] = resolve_area_override(
+ issue.get('body', ''),
+ issue.get('info'),
+ area_to_owner,
+ )
+
print("Creating test-to-issue mapping...")
test_to_issue = create_test_issue_mapping(issues_data)
print(f"Created mapping for {len(test_to_issue)} unique test names")
-
- # Convert mapping to table data format
+
+ override_count = 0
+ for issue_list in test_to_issue.values():
+ for issue_info in issue_list:
+ ao = url_to_area_override.get(issue_info['url'])
+ issue_info['area_override'] = ao
+ if ao:
+ override_count += 1
+
+ if override_count:
+ print(f"Resolved {override_count} area_override(s) from area labels")
+
mapping_data = convert_mapping_to_table_data(test_to_issue)
print(f"Converted to {len(mapping_data)} table records")
-
- # Create mapping table (wrapper will add database_path automatically)
+
create_test_issue_mapping_table(ydb_wrapper, table_path)
-
- # Bulk upsert mapping data (wrapper will add database_path automatically)
+
+ existing_by_key = _fetch_existing_github_issue_mapping(ydb_wrapper, table_path)
+ if existing_by_key is None:
+ return 1
+ merge_area_override_since(mapping_data, existing_by_key, url_to_updated_at)
+
if mapping_data:
bulk_upsert_mapping_data(ydb_wrapper, table_path, mapping_data)
else:
print("No mapping data to insert")
-
+
script_elapsed = time.time() - script_start_time
print(f"Script completed successfully, total time: {script_elapsed:.2f}s")
-
+
except Exception as e:
print(f"Error during execution: {e}")
return 1
-
+
return 0
if __name__ == "__main__":
- exit(main()) \ No newline at end of file
+ exit(main())