diff options
author | serg-belyakov <serg-belyakov@yandex-team.com> | 2023-09-28 18:08:50 +0300 |
---|---|---|
committer | serg-belyakov <serg-belyakov@yandex-team.com> | 2023-09-28 18:48:30 +0300 |
commit | 887dfe66960e19ce9285f041a642b8d5c9a68525 (patch) | |
tree | 4621c2209ddc117eb883db2e3d40b0aabff798cc | |
parent | e1bf3a3e544c1895f0cbdca317f43740cc8cb6db (diff) | |
download | ydb-887dfe66960e19ce9285f041a642b8d5c9a68525.tar.gz |
Remove assert, check that target pdisk won't become overpopulated after reassign, don't count disks from static groups twice, KIKIMR-19458
Fix assert, check that target pdisk won't become overpopulated after reassign, KIKIMR-19458
-rw-r--r-- | ydb/apps/dstool/lib/common.py | 2 | ||||
-rw-r--r-- | ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py | 39 |
2 files changed, 28 insertions, 13 deletions
diff --git a/ydb/apps/dstool/lib/common.py b/ydb/apps/dstool/lib/common.py index 419f1aab97..5181183852 100644 --- a/ydb/apps/dstool/lib/common.py +++ b/ydb/apps/dstool/lib/common.py @@ -670,6 +670,8 @@ def build_pdisk_usage_map(base_config, count_donors=False, storage_pool=None): pdisk_usage_map[pdisk_id] = pdisk.NumStaticSlots for vslot in base_config.VSlot: + if not (vslot.GroupId & 0x80000000): # don't count vslots from static groups twice + continue pdisk_id = get_pdisk_id(vslot.VSlotId) if pdisk_id not in pdisk_usage_map: continue diff --git a/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py b/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py index 46aef7dd52..d4f01e64a5 100644 --- a/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py +++ b/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py @@ -21,7 +21,8 @@ def do(args): node_mon_map = common.fetch_node_mon_map({vslot.VSlotId.NodeId for vslot in base_config.VSlot}) vslot_map = common.build_vslot_map(base_config) pdisk_map = common.build_pdisk_map(base_config) - pdisk_usage = common.build_pdisk_usage_map(base_config) + pdisk_usage = common.build_pdisk_usage_map(base_config, count_donors=False) + pdisk_usage_w_donors = common.build_pdisk_usage_map(base_config, count_donors=True) vdisks_groups_count_map = defaultdict(int) for group in base_config.Group: @@ -52,7 +53,7 @@ def do(args): if unhealthy_groups: common.print_if_verbose(args, 'Skipping vdisks from unhealthy groups: %s' % (unhealthy_groups), file=sys.stdout) - healty_vslots = [ + healthy_vslots = [ vslot for vslot in base_config.VSlot if vslot.GroupId in healthy_groups @@ -69,7 +70,7 @@ def do(args): common.print_status(args, success=True, error_reason='') break - healty_vslots_from_overpopulated_pdisks = [] + healthy_vslots_from_overpopulated_pdisks = [] for vslot in base_config.VSlot: pdisk_id = common.get_pdisk_id(vslot.VSlotId) if pdisk_id not in overpopulated_pdisks: @@ -77,15 +78,15 @@ def do(args): if vslot.GroupId not in healthy_groups: continue - healty_vslots_from_overpopulated_pdisks.append(vslot) + healthy_vslots_from_overpopulated_pdisks.append(vslot) candidate_vslots = [] - if healty_vslots_from_overpopulated_pdisks: - common.print_if_not_quiet(args, f'Found {len(healty_vslots_from_overpopulated_pdisks)} vdisks from overpopulated pdisks', sys.stdout) - candidate_vslots = healty_vslots_from_overpopulated_pdisks - elif healty_vslots and not args.only_from_overpopulated_pdisks: - common.print_if_not_quiet(args, f'Found {len(healty_vslots)} vdisks suitable for relocation', sys.stdout) - candidate_vslots = healty_vslots + if healthy_vslots_from_overpopulated_pdisks: + common.print_if_not_quiet(args, f'Found {len(healthy_vslots_from_overpopulated_pdisks)} vdisks from overpopulated pdisks', sys.stdout) + candidate_vslots = healthy_vslots_from_overpopulated_pdisks + elif healthy_vslots and not args.only_from_overpopulated_pdisks: + common.print_if_not_quiet(args, f'Found {len(healthy_vslots)} vdisks suitable for relocation', sys.stdout) + candidate_vslots = healthy_vslots else: # candidate_vslots is empty common.print_if_not_quiet(args, 'No vdisks suitable for relocation found, waiting..', sys.stdout) time.sleep(10) @@ -101,7 +102,7 @@ def do(args): common.print_if_verbose(args, 'Checking to relocate vdisk from vslot %s on pdisk %s with slot usage %d' % (vslot_id, pdisk_id, pdisk_usage[pdisk_id]), file=sys.stdout) current_usage = pdisk_usage[pdisk_id] - if not healty_vslots_from_overpopulated_pdisks: + if not healthy_vslots_from_overpopulated_pdisks: for i in range(0, current_usage - 1): if histo[i]: break @@ -132,13 +133,25 @@ def do(args): pdisk_from = item.From.NodeId, item.From.PDiskId pdisk_to = item.To.NodeId, item.To.PDiskId if pdisk_usage[pdisk_to] + 1 > pdisk_usage[pdisk_from] - 1: - assert not healty_vslots_from_overpopulated_pdisks + if pdisk_usage_w_donors[pdisk_to] + 1 > pdisk_map[pdisk_to].ExpectedSlotCount: + common.print_if_not_quiet( + args, + 'NOTICE: Attempted to reassign vdisk from pdisk [%d:%d] to pdisk [%d:%d] with slot usage %d and slot limit %d on latter', + *pdisk_from, *pdisk_to, pdisk_usage_w_donors[pdisk_to], pdisk_map[pdisk_to].ExpectedSlotCount) + return False + if not try_blocking: return False request = common.kikimr_bsconfig.TConfigRequest(Rollback=True) inactive = [] for pdisk in base_config.PDisk: - if pdisk_usage[common.get_pdisk_id(pdisk)] + 1 > pdisk_usage[pdisk_id] - 1: + disk_is_better = True + check_pdisk_id = common.get_pdisk_id(pdisk) + if not healthy_vslots_from_overpopulated_pdisks and pdisk_usage_w_donors[check_pdisk_id] + 1 > pdisk_usage_w_donors[pdisk_id] - 1: + disk_is_better = False + if healthy_vslots_from_overpopulated_pdisks and pdisk_usage_w_donors[check_pdisk_id] + 1 > pdisk_map[pdisk_to].ExpectedSlotCount: + disk_is_better = False + if not disk_is_better: add_update_drive_status(request, pdisk, common.kikimr_bsconfig.EDriveStatus.INACTIVE) inactive.append(pdisk) index = len(request.Command) |