1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
|
#!/usr/bin/env python3
import argparse
import datetime
import os
import ydb
from ydb_wrapper import YDBWrapper
BASE_DATE = datetime.date(1970, 1, 1)
DEFAULT_MONTHS_BACK = 180 # 6 months
def parse_arguments():
"""Parse command line arguments."""
parser = argparse.ArgumentParser()
parser.add_argument('--build_type', default='relwithdebinfo', type=str, help='build types')
parser.add_argument('--branch', default='main', type=str, help='branch')
parser.add_argument('--start-date', dest='start_date', type=str, help='Start date (YYYY-MM-DD), inclusive')
parser.add_argument('--end-date', dest='end_date', type=str, help='End date (YYYY-MM-DD), inclusive')
args, unknown = parser.parse_known_args()
start_date_override = datetime.date.fromisoformat(args.start_date) if args.start_date else None
end_date_override = datetime.date.fromisoformat(args.end_date) if args.end_date else None
if start_date_override and end_date_override and start_date_override > end_date_override:
raise ValueError("start-date must be earlier or equal to end-date")
return {
'build_type': args.build_type,
'branch': args.branch,
'start_date_override': start_date_override,
'end_date_override': end_date_override,
}
def convert_ydb_date_to_python(date_value):
"""Convert YDB date (int or date) to Python date."""
if date_value is None:
return None
if isinstance(date_value, int):
return BASE_DATE + datetime.timedelta(days=date_value)
return date_value
def get_max_date_from_history(ydb_wrapper, flaky_tests_table, build_type, branch):
"""Get maximum date from flaky_tests_window table."""
query = f"""
select max(date_window) as max_date_window from `{flaky_tests_table}`
where build_type = '{build_type}' and branch = '{branch}'
"""
results = ydb_wrapper.execute_scan_query(query, query_name=f"get_last_date_from_history_{branch}")
max_date_window = results[0].get('max_date_window') if results[0] else None
return convert_ydb_date_to_python(max_date_window)
def get_min_date_from_test_runs(ydb_wrapper, test_runs_table, build_type, branch):
"""Get minimum date from test_runs table."""
query = f"""
select min(cast(run_timestamp as Date)) as min_run_date from `{test_runs_table}`
where build_type = '{build_type}' and branch = '{branch}'
"""
try:
results = ydb_wrapper.execute_scan_query(query, query_name=f"get_min_date_from_test_runs_{branch}")
min_run_date = results[0].get('min_run_date') if results[0] else None
return convert_ydb_date_to_python(min_run_date)
except Exception as e:
print(f'⚠️ Failed to get min date from test_runs_table: {e}')
return None
def determine_start_date(ydb_wrapper, test_runs_table, flaky_tests_table, build_type, branch,
start_date_override, end_date_override):
"""Determine the start date for processing history.
Logic:
1. If start_date_override is provided, use it (must be <= end_date)
2. If history exists, use max_date_window from history (but not after end_date)
3. If no history, check test_runs_table for min_run_date
4. Default to 6 months ago from end_date
"""
end_date = end_date_override if end_date_override else datetime.date.today()
default_start_date = end_date - datetime.timedelta(days=DEFAULT_MONTHS_BACK)
# If user explicitly provided start_date_override, use it
if start_date_override:
if start_date_override > end_date:
raise ValueError(f"start-date ({start_date_override}) must be earlier or equal to end-date ({end_date})")
return start_date_override, end_date
# Try to get max date from history
max_date_window = get_max_date_from_history(ydb_wrapper, flaky_tests_table, build_type, branch)
if max_date_window is not None:
# Clamp max_date_window to not exceed end_date before comparison
max_date_clamped = min(max_date_window, end_date)
# Use max_date_clamped if it's greater than default_start_date
if max_date_clamped > default_start_date:
start_date = max_date_clamped
else:
start_date = default_start_date
return start_date, end_date
# No history exists, check test_runs_table for min date
min_run_date = get_min_date_from_test_runs(ydb_wrapper, test_runs_table, build_type, branch)
if min_run_date is not None:
# Use the later of min_run_date and default_start_date
start_date = max(min_run_date, default_start_date)
# Ensure start_date doesn't exceed end_date
start_date = min(start_date, end_date)
print(f'📅 Found min date from test_runs_table: {min_run_date}, using: {start_date}')
else:
start_date = default_start_date
print(f'📅 No data in test_runs_table, using default: {start_date}')
return start_date, end_date
def build_history_query(date, test_runs_table, testowners_table, build_type, branch):
"""Build SQL query to get test history for a specific date."""
return f"""
select
full_name,
date_base,
history_list,
if(dist_hist = '','no_runs',dist_hist) as dist_hist,
suite_folder,
test_name,
build_type,
branch,
owners,
first_run,
last_run
from (
select
full_name,
date_base,
AGG_LIST(status) as history_list ,
String::JoinFromList( ListSort(AGG_LIST_DISTINCT(status)) ,',') as dist_hist,
suite_folder,
test_name,
owners,
build_type,
branch,
min(run_timestamp) as first_run,
max(run_timestamp) as last_run
from (
select * from (
select distinct
full_name,
suite_folder,
test_name,
owners,
Date('{date}') as date_base,
'{build_type}' as build_type,
'{branch}' as branch
from `{testowners_table}`
) as test_and_date
left JOIN (
select
suite_folder || '/' || test_name as full_name,
run_timestamp,
status
from `{test_runs_table}`
where
run_timestamp >= Date('{date}')
and run_timestamp < Date('{date}') + Interval("P1D")
and job_name in (
'Nightly-run',
'Regression-run',
'Regression-run_Large',
'Regression-run_Small_and_Medium',
'Regression-run_compatibility',
'Regression-whitelist-run',
'Postcommit_relwithdebinfo',
'Postcommit_asan'
)
and build_type = '{build_type}'
and branch = '{branch}'
and (pull IS NULL OR NOT String::Contains(pull, 'manual'))
order by full_name,run_timestamp desc
) as hist
ON test_and_date.full_name=hist.full_name
)
GROUP BY full_name,suite_folder,test_name,date_base,build_type,branch,owners
)
"""
def prepare_row_data(row):
"""Prepare a single row for database upsert."""
# Count status occurrences
history_list = list(row['history_list'])
status_counts = {status: history_list.count(status) for status in history_list}
return {
'suite_folder': row['suite_folder'],
'test_name': row['test_name'],
'full_name': row['full_name'],
'date_window': row['date_base'],
'days_ago_window': 1,
'build_type': row['build_type'],
'branch': row['branch'],
'owners': row.get('owners'),
'first_run': row['first_run'],
'last_run': row['last_run'],
'history': ','.join(history_list).encode('utf8'),
'history_class': row['dist_hist'],
'pass_count': status_counts.get('passed', 0),
'mute_count': status_counts.get('mute', 0),
'fail_count': status_counts.get('failure', 0),
'skip_count': status_counts.get('skipped', 0),
}
def get_column_types():
"""Get column types for bulk upsert."""
return (
ydb.BulkUpsertColumns()
.add_column("test_name", ydb.OptionalType(ydb.PrimitiveType.Utf8))
.add_column("suite_folder", ydb.OptionalType(ydb.PrimitiveType.Utf8))
.add_column("build_type", ydb.OptionalType(ydb.PrimitiveType.Utf8))
.add_column("branch", ydb.OptionalType(ydb.PrimitiveType.Utf8))
.add_column("owners", ydb.OptionalType(ydb.PrimitiveType.Utf8))
.add_column("first_run", ydb.OptionalType(ydb.PrimitiveType.Timestamp))
.add_column("last_run", ydb.OptionalType(ydb.PrimitiveType.Timestamp))
.add_column("full_name", ydb.OptionalType(ydb.PrimitiveType.Utf8))
.add_column("date_window", ydb.OptionalType(ydb.PrimitiveType.Date))
.add_column("days_ago_window", ydb.OptionalType(ydb.PrimitiveType.Uint64))
.add_column("history", ydb.OptionalType(ydb.PrimitiveType.String))
.add_column("history_class", ydb.OptionalType(ydb.PrimitiveType.String))
.add_column("pass_count", ydb.OptionalType(ydb.PrimitiveType.Uint64))
.add_column("mute_count", ydb.OptionalType(ydb.PrimitiveType.Uint64))
.add_column("fail_count", ydb.OptionalType(ydb.PrimitiveType.Uint64))
.add_column("skip_count", ydb.OptionalType(ydb.PrimitiveType.Uint64))
)
def process_date_range(ydb_wrapper, test_runs_table, testowners_table, flaky_tests_table,
build_type, branch, start_date, end_date):
"""Process all dates in the range and collect data for bulk upsert."""
date_list = [start_date + datetime.timedelta(days=x)
for x in range((end_date - start_date).days + 1)]
print(f'📊 Processing {len(date_list)} dates from {start_date} to {end_date}')
all_prepared_rows = []
for i, date in enumerate(sorted(date_list), 1):
print(f'📅 Processing date {i}/{len(date_list)}: {date}')
query = build_history_query(date, test_runs_table, testowners_table, build_type, branch)
results = ydb_wrapper.execute_scan_query(
query,
query_name=f"get_flaky_test_history_for_date_{branch}"
)
print(f'📈 History data captured, {len(results)} rows')
for row in results:
all_prepared_rows.append(prepare_row_data(row))
return all_prepared_rows
def main():
"""Main function."""
args = parse_arguments()
build_type = args['build_type']
branch = args['branch']
start_date_override = args['start_date_override']
end_date_override = args['end_date_override']
print(f'🚀 Starting flaky_tests_history.py')
print(f' 📅 Days window: 1')
print(f' 🔧 Build type: {build_type}')
print(f' 🌿 Branch: {branch}')
if start_date_override:
print(f' 📍 Start date override: {start_date_override}')
if end_date_override:
print(f' 📍 End date override: {end_date_override}')
with YDBWrapper() as ydb_wrapper:
# Get table paths from config
test_runs_table = ydb_wrapper.get_table_path("test_results")
testowners_table = ydb_wrapper.get_table_path("testowners")
flaky_tests_table = ydb_wrapper.get_table_path("flaky_tests_window")
try:
# Determine start and end dates
start_date, end_date = determine_start_date(
ydb_wrapper, test_runs_table, flaky_tests_table,
build_type, branch, start_date_override, end_date_override
)
# Final validation
if start_date > end_date:
raise ValueError(f"Start date ({start_date}) is after end date ({end_date}); nothing to process")
print(f'📅 Start history date: {start_date}')
print(f'📅 End history date: {end_date}')
# Create table if it doesn't exist
create_table_sql = f"""
CREATE table IF NOT EXISTS `{flaky_tests_table}` (
`test_name` Utf8 NOT NULL,
`suite_folder` Utf8 NOT NULL,
`full_name` Utf8 NOT NULL,
`date_window` Date NOT NULL,
`build_type` Utf8 NOT NULL,
`branch` Utf8 NOT NULL,
`first_run` Timestamp,
`last_run` Timestamp ,
`owners` Utf8 ,
`days_ago_window` Uint64 NOT NULL,
`history` String,
`history_class` String,
`pass_count` Uint64,
`mute_count` Uint64,
`fail_count` Uint64,
`skip_count` Uint64,
PRIMARY KEY (`test_name`, `suite_folder`, `full_name`,date_window, build_type, branch)
)
PARTITION BY HASH(`full_name`,build_type,branch)
WITH (STORE = COLUMN)
"""
ydb_wrapper.create_table(flaky_tests_table, create_table_sql)
# Process all dates and collect data
all_prepared_rows = process_date_range(
ydb_wrapper, test_runs_table, testowners_table, flaky_tests_table,
build_type, branch, start_date, end_date
)
# Insert all data in one batch
if all_prepared_rows:
print(f'💾 Upserting {len(all_prepared_rows)} rows of history data')
column_types = get_column_types()
ydb_wrapper.bulk_upsert_batches(
flaky_tests_table,
all_prepared_rows,
column_types,
batch_size=1000
)
print('✅ History updated successfully')
else:
print('ℹ️ No data to upload')
except Exception as e:
print(f'❌ Script failed: {e}')
raise
if __name__ == "__main__":
main()
|