about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: serg-belyakov <serg-belyakov@yandex-team.com> 2023-09-28 18:08:50 +0300
committer: serg-belyakov <serg-belyakov@yandex-team.com> 2023-09-28 18:48:30 +0300
commit: 887dfe66960e19ce9285f041a642b8d5c9a68525 (patch)
tree: 4621c2209ddc117eb883db2e3d40b0aabff798cc
parent: e1bf3a3e544c1895f0cbdca317f43740cc8cb6db (diff)
download: ydb-887dfe66960e19ce9285f041a642b8d5c9a68525.tar.gz
Remove assert, check that target pdisk won't become overpopulated after reassign, don't count disks from static groups twice, KIKIMR-19458
Fix assert, check that target pdisk won't become overpopulated after reassign, KIKIMR-19458
-rw-r--r-- ydb/apps/dstool/lib/common.py | 2
-rw-r--r-- ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py | 39
2 files changed, 28 insertions, 13 deletions
diff --git a/ydb/apps/dstool/lib/common.py b/ydb/apps/dstool/lib/common.py
index 419f1aab97..5181183852 100644
--- a/ydb/apps/dstool/lib/common.py
+++ b/ydb/apps/dstool/lib/common.py
@@ -670,6 +670,8 @@ def build_pdisk_usage_map(base_config, count_donors=False, storage_pool=None):
pdisk_usage_map[pdisk_id] = pdisk.NumStaticSlots
for vslot in base_config.VSlot:
+ if not (vslot.GroupId & 0x80000000): # don't count vslots from static groups twice
+ continue
pdisk_id = get_pdisk_id(vslot.VSlotId)
if pdisk_id not in pdisk_usage_map:
continue
diff --git a/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py b/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py
index 46aef7dd52..d4f01e64a5 100644
--- a/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py
+++ b/ydb/apps/dstool/lib/dstool_cmd_cluster_balance.py
@@ -21,7 +21,8 @@ def do(args):
node_mon_map = common.fetch_node_mon_map({vslot.VSlotId.NodeId for vslot in base_config.VSlot})
vslot_map = common.build_vslot_map(base_config)
pdisk_map = common.build_pdisk_map(base_config)
- pdisk_usage = common.build_pdisk_usage_map(base_config)
+ pdisk_usage = common.build_pdisk_usage_map(base_config, count_donors=False)
+ pdisk_usage_w_donors = common.build_pdisk_usage_map(base_config, count_donors=True)
vdisks_groups_count_map = defaultdict(int)
for group in base_config.Group:
@@ -52,7 +53,7 @@ def do(args):
if unhealthy_groups:
common.print_if_verbose(args, 'Skipping vdisks from unhealthy groups: %s' % (unhealthy_groups), file=sys.stdout)
- healty_vslots = [
+ healthy_vslots = [
vslot
for vslot in base_config.VSlot
if vslot.GroupId in healthy_groups
@@ -69,7 +70,7 @@ def do(args):
common.print_status(args, success=True, error_reason='')
break
- healty_vslots_from_overpopulated_pdisks = []
+ healthy_vslots_from_overpopulated_pdisks = []
for vslot in base_config.VSlot:
pdisk_id = common.get_pdisk_id(vslot.VSlotId)
if pdisk_id not in overpopulated_pdisks:
@@ -77,15 +78,15 @@ def do(args):
if vslot.GroupId not in healthy_groups:
continue
- healty_vslots_from_overpopulated_pdisks.append(vslot)
+ healthy_vslots_from_overpopulated_pdisks.append(vslot)
candidate_vslots = []
- if healty_vslots_from_overpopulated_pdisks:
- common.print_if_not_quiet(args, f'Found {len(healty_vslots_from_overpopulated_pdisks)} vdisks from overpopulated pdisks', sys.stdout)
- candidate_vslots = healty_vslots_from_overpopulated_pdisks
- elif healty_vslots and not args.only_from_overpopulated_pdisks:
- common.print_if_not_quiet(args, f'Found {len(healty_vslots)} vdisks suitable for relocation', sys.stdout)
- candidate_vslots = healty_vslots
+ if healthy_vslots_from_overpopulated_pdisks:
+ common.print_if_not_quiet(args, f'Found {len(healthy_vslots_from_overpopulated_pdisks)} vdisks from overpopulated pdisks', sys.stdout)
+ candidate_vslots = healthy_vslots_from_overpopulated_pdisks
+ elif healthy_vslots and not args.only_from_overpopulated_pdisks:
+ common.print_if_not_quiet(args, f'Found {len(healthy_vslots)} vdisks suitable for relocation', sys.stdout)
+ candidate_vslots = healthy_vslots
else: # candidate_vslots is empty
common.print_if_not_quiet(args, 'No vdisks suitable for relocation found, waiting..', sys.stdout)
time.sleep(10)
@@ -101,7 +102,7 @@ def do(args):
common.print_if_verbose(args, 'Checking to relocate vdisk from vslot %s on pdisk %s with slot usage %d' % (vslot_id, pdisk_id, pdisk_usage[pdisk_id]), file=sys.stdout)
current_usage = pdisk_usage[pdisk_id]
- if not healty_vslots_from_overpopulated_pdisks:
+ if not healthy_vslots_from_overpopulated_pdisks:
for i in range(0, current_usage - 1):
if histo[i]:
break
@@ -132,13 +133,25 @@ def do(args):
pdisk_from = item.From.NodeId, item.From.PDiskId
pdisk_to = item.To.NodeId, item.To.PDiskId
if pdisk_usage[pdisk_to] + 1 > pdisk_usage[pdisk_from] - 1:
- assert not healty_vslots_from_overpopulated_pdisks
+ if pdisk_usage_w_donors[pdisk_to] + 1 > pdisk_map[pdisk_to].ExpectedSlotCount:
+ common.print_if_not_quiet(
+ args,
+ 'NOTICE: Attempted to reassign vdisk from pdisk [%d:%d] to pdisk [%d:%d] with slot usage %d and slot limit %d on latter',
+ *pdisk_from, *pdisk_to, pdisk_usage_w_donors[pdisk_to], pdisk_map[pdisk_to].ExpectedSlotCount)
+ return False
+
if not try_blocking:
return False
request = common.kikimr_bsconfig.TConfigRequest(Rollback=True)
inactive = []
for pdisk in base_config.PDisk:
- if pdisk_usage[common.get_pdisk_id(pdisk)] + 1 > pdisk_usage[pdisk_id] - 1:
+ disk_is_better = True
+ check_pdisk_id = common.get_pdisk_id(pdisk)
+ if not healthy_vslots_from_overpopulated_pdisks and pdisk_usage_w_donors[check_pdisk_id] + 1 > pdisk_usage_w_donors[pdisk_id] - 1:
+ disk_is_better = False
+ if healthy_vslots_from_overpopulated_pdisks and pdisk_usage_w_donors[check_pdisk_id] + 1 > pdisk_map[pdisk_to].ExpectedSlotCount:
+ disk_is_better = False
+ if not disk_is_better:
add_update_drive_status(request, pdisk, common.kikimr_bsconfig.EDriveStatus.INACTIVE)
inactive.append(pdisk)
index = len(request.Command)