diff options
author | Kirill Rysin <35688753+naspirato@users.noreply.github.com> | 2024-09-19 19:15:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-09-19 19:15:30 +0200 |
commit | 9e8c4c6cdd8071acb00cb86361335c698434e4e5 (patch) | |
tree | 9dc9ab71141b7a666be17d3442d6600f056c8c61 | |
parent | 60dc4dfc59ffcf6973eb1151f80762ad3d1af7b1 (diff) | |
download | ydb-9e8c4c6cdd8071acb00cb86361335c698434e4e5.tar.gz |
Fix bulk upsert rewrite results asan/relwithdevinfo in analytics (#9521)
-rwxr-xr-x | .github/scripts/analytics/flaky_tests_history.py | 201 | ||||
-rwxr-xr-x | .github/scripts/analytics/flaky_tests_history_n_runs.py | 1 | ||||
-rw-r--r-- | .github/workflows/collect_analytics.yml | 4 |
3 files changed, 110 insertions, 96 deletions
diff --git a/.github/scripts/analytics/flaky_tests_history.py b/.github/scripts/analytics/flaky_tests_history.py index 95524930ed..4d0782a7e9 100755 --- a/.github/scripts/analytics/flaky_tests_history.py +++ b/.github/scripts/analytics/flaky_tests_history.py @@ -32,6 +32,9 @@ def create_tables(pool, table_path): `date_window` Date NOT NULL, `build_type` Utf8 NOT NULL, `branch` Utf8 NOT NULL, + `first_run` Timestamp, + `last_run` Timestamp , + `owners` Utf8 , `days_ago_window` Uint64 NOT NULL, `history` String, `history_class` String, @@ -39,9 +42,9 @@ def create_tables(pool, table_path): `mute_count` Uint64, `fail_count` Uint64, `skip_count` Uint64, - PRIMARY KEY (`test_name`, `suite_folder`, `full_name`,date_window) + PRIMARY KEY (`test_name`, `suite_folder`, `full_name`,date_window, build_type, branch) ) - PARTITION BY HASH(`full_name`) + PARTITION BY HASH(`full_name`,build_type,branch) WITH (STORE = COLUMN) """) @@ -56,6 +59,8 @@ def bulk_upsert(table_client, table_path, rows): .add_column("suite_folder", ydb.OptionalType(ydb.PrimitiveType.Utf8)) .add_column("build_type", ydb.OptionalType(ydb.PrimitiveType.Utf8)) .add_column("branch", ydb.OptionalType(ydb.PrimitiveType.Utf8)) + .add_column("first_run", ydb.OptionalType(ydb.PrimitiveType.Timestamp)) + .add_column("last_run", ydb.OptionalType(ydb.PrimitiveType.Timestamp)) .add_column("full_name", ydb.OptionalType(ydb.PrimitiveType.Utf8)) .add_column("date_window", ydb.OptionalType(ydb.PrimitiveType.Date)) .add_column("days_ago_window", ydb.OptionalType(ydb.PrimitiveType.Uint64)) @@ -71,7 +76,8 @@ def bulk_upsert(table_client, table_path, rows): def main(): parser = argparse.ArgumentParser() - parser.add_argument('--days-window', default=5, type=int, help='how many days back we collecting history') + + parser.add_argument('--days-window', default=1, type=int, help='how many days back we collecting history') parser.add_argument('--build_type',choices=['relwithdebinfo', 'release-asan'], default='relwithdebinfo', type=str, help='build : relwithdebinfo or release-asan') parser.add_argument('--branch', default='main',choices=['main'], type=str, help='branch') @@ -108,14 +114,15 @@ def main(): tc_settings = ydb.TableClientSettings().with_native_date_in_result_sets(enabled=True) table_client = ydb.TableClient(driver, tc_settings) - table_path = f'test_results/analytics/flaky_tests_history_{history_for_n_day}_days' - default_start_date = datetime.date(2024, 7, 1) + table_path = f'test_results/analytics/flaky_tests_window_{history_for_n_day}_days' + default_start_date = datetime.date(2024, 9, 1) with ydb.SessionPool(driver) as pool: create_tables(pool, table_path) # geting last date from history - last_date_query = f"select max(date_window) as max_date_window from `{table_path}`" + last_date_query = f"""select max(date_window) as max_date_window from `{table_path}` + where build_type = '{build_type}' and branch = '{branch}'""" query = ydb.ScanQuery(last_date_query, {}) it = table_client.scan_query(query) results = [] @@ -126,119 +133,129 @@ def main(): except StopIteration: break - if results[0] and results[0].get( 'max_date_window', default_start_date) is not None: - last_date = results[0].get( - 'max_date_window', default_start_date).strftime('%Y-%m-%d') + if results[0] and results[0].get( 'max_date_window', default_start_date) is not None and results[0].get( 'max_date_window', default_start_date) > default_start_date: + last_datetime = results[0].get( + 'max_date_window', default_start_date) + else: - last_date = default_start_date.strftime('%Y-%m-%d') + last_datetime = default_start_date + + last_date = last_datetime.strftime('%Y-%m-%d') print(f'last hisotry date: {last_date}') # getting history for dates >= last_date - query_get_history = f""" - select - full_name, - date_base, - history_list, - dist_hist, - suite_folder, - test_name, - '{build_type}' as build_type, - '{branch}' as branch - from ( + + today = datetime.date.today() + date_list = [today - datetime.timedelta(days=x) for x in range((today - last_datetime).days+1)] + for date in sorted(date_list): + query_get_history = f""" + select full_name, date_base, - AGG_LIST(status) as history_list , - String::JoinFromList( AGG_LIST_DISTINCT(status) ,',') as dist_hist, + history_list, + if(dist_hist = '','no_runs',dist_hist) as dist_hist, suite_folder, - test_name + test_name, + build_type, + branch, + owners, + first_run, + last_run + from ( - select * from ( + select + full_name, + date_base, + AGG_LIST(status) as history_list , + String::JoinFromList( ListSort(AGG_LIST_DISTINCT(status)) ,',') as dist_hist, + suite_folder, + test_name, + owners, + build_type, + branch, + min(run_timestamp) as first_run, + max(run_timestamp) as last_run + from ( select * from ( - select DISTINCT + + select distinct full_name, suite_folder, - test_name + test_name, + owners, + Date('{date}') as date_base, + '{build_type}' as build_type, + '{branch}' as branch from `test_results/analytics/testowners` - where run_timestamp_last >= Date('{last_date}') - 3*Interval("P1D") - ) as all_tests - cross join ( - select - DISTINCT DateTime::MakeDate(run_timestamp) as date_base - from `test_results/test_runs_column` - where - (job_name ='Nightly-run' or job_name ='Postcommit_relwithdebinfo' or job_name ='Postcommit_asan') - and run_timestamp>= Date('{last_date}') - ) as date_list ) as test_and_date - left JOIN ( - select * from ( + left JOIN ( + select suite_folder || '/' || test_name as full_name, run_timestamp, status from `test_results/test_runs_column` where - run_timestamp >= Date('{last_date}') -{history_for_n_day}*Interval("P1D") + run_timestamp <= Date('{date}') + Interval("P1D") + and run_timestamp >= Date('{date}') - {history_for_n_day}*Interval("P1D") + and (job_name ='Nightly-run' or job_name ='Postcommit_relwithdebinfo' or job_name ='Postcommit_asan') and build_type = '{build_type}' and branch = '{branch}' order by full_name,run_timestamp desc - ) - ) as hist - ON test_and_date.full_name=hist.full_name - where - hist.run_timestamp >= test_and_date.date_base -{history_for_n_day}*Interval("P1D") AND - hist.run_timestamp < test_and_date.date_base + Interval("P1D") - + + ) as hist + ON test_and_date.full_name=hist.full_name + ) + GROUP BY full_name,suite_folder,test_name,date_base,build_type,branch,owners ) - GROUP BY full_name,suite_folder,test_name,date_base - - ) - """ - query = ydb.ScanQuery(query_get_history, {}) - # start transaction time - start_time = time.time() - it = driver.table_client.scan_query(query) - # end transaction time + """ + query = ydb.ScanQuery(query_get_history, {}) + # start transaction time + start_time = time.time() + it = driver.table_client.scan_query(query) + # end transaction time - results = [] - prepared_for_update_rows = [] - while True: - try: - result = next(it) - results = results + result.result_set.rows - except StopIteration: - break - end_time = time.time() - print(f'transaction duration: {end_time - start_time}') - - print(f'history data captured, {len(results)} rows') - for row in results: - row['count'] = dict(zip(list(row['history_list']), [list( - row['history_list']).count(i) for i in list(row['history_list'])])) - prepared_for_update_rows.append({ - 'suite_folder': row['suite_folder'], - 'test_name': row['test_name'], - 'full_name': row['full_name'], - 'date_window': row['date_base'], - 'days_ago_window': history_for_n_day, - 'build_type': row['build_type'], - 'branch': row['branch'], - 'history': ','.join(row['history_list']).encode('utf8'), - 'history_class': row['dist_hist'], - 'pass_count': row['count'].get('passed', 0), - 'mute_count': row['count'].get('mute', 0), - 'fail_count': row['count'].get('failure', 0), - 'skip_count': row['count'].get('skipped', 0), - }) - print('upserting history') - with ydb.SessionPool(driver) as pool: + results = [] + prepared_for_update_rows = [] + while True: + try: + result = next(it) + results = results + result.result_set.rows + except StopIteration: + break + end_time = time.time() + print(f'transaction duration: {end_time - start_time}') - create_tables(pool, table_path) - full_path = posixpath.join(DATABASE_PATH, table_path) - bulk_upsert(driver.table_client, full_path, - prepared_for_update_rows) + print(f'history data captured, {len(results)} rows') + for row in results: + row['count'] = dict(zip(list(row['history_list']), [list( + row['history_list']).count(i) for i in list(row['history_list'])])) + prepared_for_update_rows.append({ + 'suite_folder': row['suite_folder'], + 'test_name': row['test_name'], + 'full_name': row['full_name'], + 'date_window': row['date_base'], + 'days_ago_window': history_for_n_day, + 'build_type': row['build_type'], + 'branch': row['branch'], + 'first_run': row['first_run'], + 'last_run': row['last_run'], + 'history': ','.join(row['history_list']).encode('utf8'), + 'history_class': row['dist_hist'], + 'pass_count': row['count'].get('passed', 0), + 'mute_count': row['count'].get('mute', 0), + 'fail_count': row['count'].get('failure', 0), + 'skip_count': row['count'].get('skipped', 0), + }) + print(f'upserting history for date {date}') + with ydb.SessionPool(driver) as pool: + + create_tables(pool, table_path) + full_path = posixpath.join(DATABASE_PATH, table_path) + bulk_upsert(driver.table_client, full_path, + prepared_for_update_rows) print('history updated') diff --git a/.github/scripts/analytics/flaky_tests_history_n_runs.py b/.github/scripts/analytics/flaky_tests_history_n_runs.py index 49ced5cb76..d6e92232d8 100755 --- a/.github/scripts/analytics/flaky_tests_history_n_runs.py +++ b/.github/scripts/analytics/flaky_tests_history_n_runs.py @@ -185,6 +185,7 @@ def main(): '{branch}' as branch from `test_results/analytics/testowners` as t1 where run_timestamp_last >= Date('{date}') - 3*Interval("P1D") + and run_timestamp_last <= Date('{date}') + Interval("P1D") ) as test_and_date left JOIN ( select * from ( diff --git a/.github/workflows/collect_analytics.yml b/.github/workflows/collect_analytics.yml index c53f7aca7a..da1345daab 100644 --- a/.github/workflows/collect_analytics.yml +++ b/.github/workflows/collect_analytics.yml @@ -29,10 +29,6 @@ jobs: python3 -m pip install ydb ydb[yc] codeowners - name: Collect testowners run: python3 .github/scripts/analytics/upload_testowners.py - - name: Collect test history data with window 5 days relwithdebinfo for main - run: python3 .github/scripts/analytics/flaky_tests_history.py --days-window=5 - - name: Collect test history data with window 5 days release-asan for main - run: python3 .github/scripts/analytics/flaky_tests_history.py --days-window=5 --build_type=release-asan - name: Collect test history data with window 1 days relwithdebinfo for main run: python3 .github/scripts/analytics/flaky_tests_history.py --days-window=1 - name: Collect test history data with window 1 days release-asan for main |