author     t1mursadykov <t1mursadykov@ydb.tech>   2022-11-23 15:06:06 +0300
committer  t1mursadykov <t1mursadykov@ydb.tech>   2022-11-23 15:06:06 +0300
commit     837e447ad5eca521acde76ff230a27a8d406acaa (patch)
tree       b2ca76a819f43edfaba4a87b6ab9aaf070153ecf
parent     98228b1c88aff21f1fe7e0ee2d2ed0a9c28cc781 (diff)
download   ydb-837e447ad5eca521acde76ff230a27a8d406acaa.tar.gz
CMS functional tests
-rw-r--r--  ydb/tests/functional/cms/test_cms_erasure.py              85
-rw-r--r--  ydb/tests/functional/cms/test_cms_restart.py              94
-rw-r--r--  ydb/tests/functional/cms/test_cms_state_storage.py        70
-rw-r--r--  ydb/tests/functional/cms/utils.py                         18
-rw-r--r--  ydb/tests/library/common/cms.py                           57
-rw-r--r--  ydb/tests/library/common/protobuf_cms.py                  42
-rw-r--r--  ydb/tests/library/harness/kikimr_config.py                24
-rw-r--r--  ydb/tests/library/harness/resources/default_domains.txt    5
-rw-r--r--  ydb/tests/library/harness/resources/default_yaml.yml       5
9 files changed, 387 insertions, 13 deletions
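
Every test below follows the same pattern: create KV tablets, ask CMS for permission to take a set of nodes down, stop exactly the granted set, then prove the cluster is still serving. A condensed sketch of that pattern (mine, not part of the commit; the names match the helpers the commit introduces):

    # Sketch only: `cluster` and `mode` come from the concrete test class.
    def cms_shutdown_pattern(cluster, mode):
        tablet_ids = create_tablets_and_wait_for_start(
            cluster.client, 10, batch_size=10, timeout_seconds=120)
        # CMS decides which of the requested nodes may go down together
        allowed = cms.request_shutdown_nodes(
            cluster.client, cluster.nodes.keys(), mode)
        for node_id in allowed:
            cluster.nodes[node_id].stop()
        # talk to a surviving node and check availability from there
        client = utils.create_client_from_alive_hosts(cluster, allowed)
        return client, tablet_ids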
diff --git a/ydb/tests/functional/cms/test_cms_erasure.py b/ydb/tests/functional/cms/test_cms_erasure.py
new file mode 100644
index 00000000000..4d18cca1819
--- /dev/null
+++ b/ydb/tests/functional/cms/test_cms_erasure.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+import logging
+import time
+from hamcrest import assert_that
+
+from ydb.core.protos.cms_pb2 import EAvailabilityMode
+
+from ydb.tests.library.common.types import Erasure
+from ydb.tests.library.common.protobuf import KVRequest
+import ydb.tests.library.common.cms as cms
+from ydb.tests.library.harness.util import LogLevels
+from ydb.tests.library.harness.kikimr_cluster import kikimr_cluster_factory
+from ydb.tests.library.harness.kikimr_config import KikimrConfigGenerator
+from ydb.tests.library.kv.helpers import create_tablets_and_wait_for_start
+from ydb.tests.library.matchers.response import is_ok_response
+
+import utils
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractLocalClusterTest(object):
+    erasure = None
+    mode = None
+
+    @classmethod
+    def setup_class(cls):
+        nodes_count = 8 if cls.erasure == Erasure.BLOCK_4_2 else 9
+        configurator = KikimrConfigGenerator(cls.erasure,
+                                             nodes=nodes_count,
+                                             additional_log_configs={'CMS': LogLevels.DEBUG}
+                                             )
+        cls.cluster = kikimr_cluster_factory(configurator=configurator)
+        cls.cluster.start()
+        # CMS will not allow disabling state storage
+        # nodes for the first 2 minutes
+        time.sleep(120)
+        cms.request_increase_ratio_limit(cls.cluster.client)
+
+    @classmethod
+    def teardown_class(cls):
+        cls.cluster.stop()
+
+
+class AbstractTestCmsDegradedGroups(AbstractLocalClusterTest):
+    def test_no_degraded_groups_after_shutdown(self):
+        number_of_tablets = 10
+        tablet_ids = create_tablets_and_wait_for_start(
+            self.cluster.client, number_of_tablets,
+            batch_size=number_of_tablets,
+            timeout_seconds=120
+        )
+
+        allowed_hosts = cms.request_shutdown_nodes(self.cluster.client, self.cluster.nodes.keys(), type(self).mode)
+        for node in allowed_hosts:
+            self.cluster.nodes[node].stop()
+
+        client = utils.create_client_from_alive_hosts(self.cluster, allowed_hosts)
+        # if there are no degraded groups,
+        # the write returns OK
+        for tablet_id in tablet_ids:
+            resp = client.kv_request(
+                tablet_id, KVRequest().write(bytes("key", 'utf-8'), bytes(utils.value_for("key", tablet_id), 'utf-8'))
+            )
+            assert_that(resp, is_ok_response())
+
+
+class TestDegradedGroupBlock42Max(AbstractTestCmsDegradedGroups):
+    erasure = Erasure.BLOCK_4_2
+    mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestDegradedGroupBlock42Keep(AbstractTestCmsDegradedGroups):
+    erasure = Erasure.BLOCK_4_2
+    mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
+
+
+class TestDegradedGroupMirror3dcMax(AbstractTestCmsDegradedGroups):
+    erasure = Erasure.MIRROR_3_DC
+    mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestDegradedGroupMirror3dcKeep(AbstractTestCmsDegradedGroups):
+    erasure = Erasure.MIRROR_3_DC
+    mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
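
For context on the matrix above: block-4-2 spreads 4 data and 2 parity parts over 8 fail domains (hence 8 nodes) and survives any 2 of them being down; mirror-3-dc keeps 3 replicas across 3 datacenters (9 nodes here) and, per the YDB docs, survives a datacenter plus one more node. My reading of the two availability modes — the authoritative semantics live in cms.proto:

    # Hedged summary, not taken from the commit:
    TOLERATED_OUTAGES = {
        Erasure.BLOCK_4_2: 2,    # any two fail domains
        Erasure.MIRROR_3_DC: 4,  # a whole DC (3 nodes) plus one more
    }
    # MODE_MAX_AVAILABILITY: grant permissions only while every storage
    # group keeps full redundancy (roughly one failed element per group).
    # MODE_KEEP_AVAILABLE: grant more aggressively, as long as the data
    # stays available at all.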
diff --git a/ydb/tests/functional/cms/test_cms_restart.py b/ydb/tests/functional/cms/test_cms_restart.py
new file mode 100644
index 00000000000..30db244eaed
--- /dev/null
+++ b/ydb/tests/functional/cms/test_cms_restart.py
@@ -0,0 +1,94 @@
+import logging
+import time
+
+from hamcrest import assert_that
+
+from ydb.core.protos.cms_pb2 import EAvailabilityMode
+
+from ydb.tests.library.common.types import Erasure
+from ydb.tests.library.common.protobuf import KVRequest
+import ydb.tests.library.common.cms as cms
+from ydb.tests.library.harness.kikimr_config import KikimrConfigGenerator
+from ydb.tests.library.harness.util import LogLevels
+from ydb.tests.library.harness.kikimr_cluster import kikimr_cluster_factory
+from ydb.tests.library.kv.helpers import create_tablets_and_wait_for_start
+from ydb.tests.library.common.delayed import wait_tablets_are_active
+from ydb.tests.library.matchers.response import is_ok_response
+
+import utils
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractLocalClusterTest(object):
+    erasure = None
+    mode = None
+
+    @classmethod
+    def setup_class(cls):
+        nodes_count = 8 if cls.erasure == Erasure.BLOCK_4_2 else 9
+        nodes_count *= 2
+        configurator = KikimrConfigGenerator(cls.erasure,
+                                             nodes=nodes_count,
+                                             additional_log_configs={'CMS': LogLevels.DEBUG},
+                                             )
+        cls.cluster = kikimr_cluster_factory(configurator=configurator)
+        cls.cluster.start()
+
+        time.sleep(120)
+        cms.request_increase_ratio_limit(cls.cluster.client)
+
+    @classmethod
+    def teardown_class(cls):
+        cls.cluster.stop()
+
+
+class AbstractTestCmsStateStorageRestarts(AbstractLocalClusterTest):
+    def test_restart_as_much_as_can(self):
+        number_of_tablets = 20
+        tablet_ids = create_tablets_and_wait_for_start(
+            self.cluster.client, number_of_tablets,
+            batch_size=number_of_tablets,
+            timeout_seconds=120
+        )
+
+        restart_nodes = cms.request_shutdown_as_much_as_possible(self.cluster.client, self.cluster.nodes.keys(), type(self).mode)
+
+        for node in restart_nodes:
+            self.cluster.nodes[node].stop()
+
+        client = utils.create_client_from_alive_hosts(self.cluster, restart_nodes)
+
+        for tablet_id in tablet_ids:
+            client.tablet_kill(tablet_id)
+
+        for tablet_id in tablet_ids:
+            resp = client.kv_request(
+                tablet_id, KVRequest().write(bytes("key", 'utf-8'), bytes(utils.value_for("key", tablet_id), 'utf-8'))
+            )
+            assert_that(resp, is_ok_response())
+
+        for node in restart_nodes:
+            self.cluster.nodes[node].start()
+
+        wait_tablets_are_active(self.cluster.client, tablet_ids)
+
+
+class TestCmsStateStorageRestartsBlockMax(AbstractTestCmsStateStorageRestarts):
+    erasure = Erasure.BLOCK_4_2
+    mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestCmsStateStorageRestartsBlockKeep(AbstractTestCmsStateStorageRestarts):
+    erasure = Erasure.BLOCK_4_2
+    mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
+
+
+class TestCmsStateStorageRestartsMirrorMax(AbstractTestCmsStateStorageRestarts):
+    erasure = Erasure.MIRROR_3_DC
+    mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestCmsStateStorageRestartsMirrorKeep(AbstractTestCmsStateStorageRestarts):
+    erasure = Erasure.MIRROR_3_DC
+    mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
diff --git a/ydb/tests/functional/cms/test_cms_state_storage.py b/ydb/tests/functional/cms/test_cms_state_storage.py
new file mode 100644
index 00000000000..75c2b922b89
--- /dev/null
+++ b/ydb/tests/functional/cms/test_cms_state_storage.py
@@ -0,0 +1,70 @@
+import logging
+import time
+
+from ydb.core.protos.cms_pb2 import EAvailabilityMode
+
+from ydb.tests.library.common.types import Erasure
+import ydb.tests.library.common.cms as cms
+from ydb.tests.library.harness.kikimr_config import KikimrConfigGenerator
+from ydb.tests.library.harness.util import LogLevels
+from ydb.tests.library.harness.kikimr_cluster import kikimr_cluster_factory
+from ydb.tests.library.kv.helpers import create_tablets_and_wait_for_start
+from ydb.tests.library.common.delayed import wait_tablets_are_active
+
+import utils
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractLocalClusterTest(object):
+    erasure = None
+    mode = None
+
+    @classmethod
+    def setup_class(cls):
+        configurator = KikimrConfigGenerator(Erasure.NONE,
+                                             nodes=27,
+                                             additional_log_configs={'CMS': LogLevels.DEBUG},
+                                             state_storage_rings=[[n, n + 1, n + 2] for n in range(1, 27, 3)]
+                                             )
+        cls.cluster = kikimr_cluster_factory(configurator=configurator)
+        cls.cluster.start()
+
+        time.sleep(120)
+        cms.request_increase_ratio_limit(cls.cluster.client)
+
+    @classmethod
+    def teardown_class(cls):
+        cls.cluster.stop()
+
+
+class AbstractTestCmsStateStorageSimple(AbstractLocalClusterTest):
+    def test_check_shutdown_state_storage_nodes(self):
+        number_of_tablets = 10
+        tablet_ids = create_tablets_and_wait_for_start(
+            self.cluster.client, number_of_tablets,
+            batch_size=number_of_tablets,
+            timeout_seconds=120
+        )
+
+        allowed_hosts = cms.request_shutdown_nodes(self.cluster.client,
+                                                   self.cluster.nodes.keys(),
+                                                   type(self).mode)
+
+        for node in allowed_hosts:
+            self.cluster.nodes[node].stop()
+
+        client = utils.create_client_from_alive_hosts(self.cluster, allowed_hosts)
+
+        for tablet_id in tablet_ids:
+            client.tablet_kill(tablet_id)
+
+        wait_tablets_are_active(client, tablet_ids)
+
+
+class TestCmsStateStorageSimpleKeep(AbstractTestCmsStateStorageSimple):
+    mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
+
+
+class TestCmsStateStorageSimpleMax(AbstractTestCmsStateStorageSimple):
+    mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
diff --git a/ydb/tests/functional/cms/utils.py b/ydb/tests/functional/cms/utils.py
new file mode 100644
index 00000000000..5eacd735762
--- /dev/null
+++ b/ydb/tests/functional/cms/utils.py
@@ -0,0 +1,18 @@
+from ydb.tests.library.harness.kikimr_client import kikimr_client_factory
+
+
+def value_for(key, tablet_id):
+    return "Value: <key = {key}, tablet_id = {tablet_id}>".format(
+        key=key, tablet_id=tablet_id)
+
+
+def create_client_from_alive_hosts(cluster, alive_nodeids):
+    alive_node = None
+    for node_id, node in cluster.nodes.items():
+        if node_id not in alive_nodeids:
+            alive_node = node
+            break
+    assert alive_node is not None
+    client = kikimr_client_factory(alive_node.host, alive_node.grpc_port, retry_count=100)
+
+    return client
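
A note on create_client_from_alive_hosts: despite the parameter name, alive_nodeids is the set of nodes that have been (or are about to be) shut down; the helper walks cluster.nodes and connects to the first node outside that set. Hypothetical usage mirroring the tests above:

    # `down` is what CMS allowed to stop; the client talks to a survivor.
    down = cms.request_shutdown_nodes(cluster.client, cluster.nodes.keys(), mode)
    client = utils.create_client_from_alive_hosts(cluster, down)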
diff --git a/ydb/tests/library/common/cms.py b/ydb/tests/library/common/cms.py
new file mode 100644
index 00000000000..1f1fa9b50e6
--- /dev/null
+++ b/ydb/tests/library/common/cms.py
@@ -0,0 +1,57 @@
+import logging
+import copy
+
+import ydb.tests.library.common.protobuf_cms as cms_pb
+from ydb.core.protos import cms_pb2
+
+logger = logging.getLogger(__name__)
+
+
+def request_increase_ratio_limit(client):
+    req = cms_pb.CmsConfigRequest()
+    client.send_request(req.protobuf, "CmsRequest")
+
+
+def request_nodes(client, node_ids, mode, type):
+    req = cms_pb.CmsPermissionRequest()
+    for node_id in node_ids:
+        req.add_action(str(node_id), type)
+    req.set_mode(mode)
+
+    logger.info("Sending permission request to CMS: %s", req.protobuf)
+    resp = client.send_request(req.protobuf, "CmsRequest")
+    logger.info("Got response from CMS: %s", resp.PermissionResponse)
+
+    nodes = []
+    for perm in resp.PermissionResponse.Permissions:
+        nodes.append(int(perm.Action.Host))
+
+    return nodes
+
+
+def request_shutdown_nodes(client, node_ids, mode):
+    return request_nodes(client, node_ids, mode, cms_pb2.TAction.SHUTDOWN_HOST)
+
+
+def request_restart_services(client, node_ids, mode):
+    return request_nodes(client, node_ids, mode, cms_pb2.TAction.RESTART_SERVICES)
+
+
+def request_as_much_as_possible(client, node_ids, mode, type):
+    not_allowed = copy.deepcopy(list(node_ids))
+    restart_nodes = []
+    allowed_nodes = request_nodes(client, node_ids, mode, type)
+
+    while len(allowed_nodes) > 0:
+        restart_nodes.extend(allowed_nodes)
+
+        for node_id in allowed_nodes:
+            not_allowed.remove(node_id)
+
+        allowed_nodes = request_nodes(client, not_allowed, mode, type)
+
+    return restart_nodes
+
+
+def request_shutdown_as_much_as_possible(client, node_ids, mode):
+    return request_as_much_as_possible(client, node_ids, mode, cms_pb2.TAction.SHUTDOWN_HOST)
diff --git a/ydb/tests/library/common/protobuf_cms.py b/ydb/tests/library/common/protobuf_cms.py
new file mode 100644
index 00000000000..f51590b45b8
--- /dev/null
+++ b/ydb/tests/library/common/protobuf_cms.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from ydb.core.protos import cms_pb2
+import ydb.core.protos.msgbus_pb2 as msgbus
+from ydb.tests.library.common.protobuf import AbstractProtobufBuilder
+
+
+class CmsPermissionRequest(AbstractProtobufBuilder):
+    def __init__(self):
+        super(CmsPermissionRequest, self).__init__(
+            msgbus.TCmsRequest())
+
+        self.protobuf.PermissionRequest.User = "user"
+        self.protobuf.PermissionRequest.Schedule = False
+        self.protobuf.PermissionRequest.DryRun = False
+        self.protobuf.PermissionRequest.PartialPermissionAllowed = True
+
+    def add_action(self, host, type):
+        action = self.protobuf.PermissionRequest.Actions.add()
+        action.Duration = 60000000
+        action.Type = type
+        action.Host = host
+        if action.Type == cms_pb2.TAction.RESTART_SERVICES:
+            action.Services = "storage"
+
+    def set_mode(self, mode):
+        self.protobuf.PermissionRequest.AvailabilityMode = mode
+
+
+class CmsConfigRequest(AbstractProtobufBuilder):
+    def __init__(self):
+        super(CmsConfigRequest, self).__init__(msgbus.TCmsRequest())
+
+        self.protobuf.SetConfigRequest.Config.ClusterLimits.DisabledNodesRatioLimit = 100
+        self.protobuf.SetConfigRequest.Config.TenantLimits.DisabledNodesRatioLimit = 100
+
+
+class CmsStateRequest(AbstractProtobufBuilder):
+    def __init__(self):
+        super(CmsStateRequest, self).__init__(msgbus.TCmsRequest())
+
+        self.protobuf.ClusterStateRequest.CopyFrom(cms_pb2.TClusterStateRequest())
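
The two modules above work together: CmsPermissionRequest builds a msgbus TCmsRequest whose PermissionRequest carries one action per node, and request_as_much_as_possible keeps re-asking for the still-forbidden nodes until CMS grants nothing more — which only works because PartialPermissionAllowed lets CMS grant a subset. A sketch of a single round (values illustrative; Duration appears to be in microseconds, so 60000000 is a 60-second permission window):

    req = cms_pb.CmsPermissionRequest()
    req.add_action("1", cms_pb2.TAction.SHUTDOWN_HOST)  # hosts are stringified node ids
    req.set_mode(EAvailabilityMode.MODE_KEEP_AVAILABLE)
    resp = client.send_request(req.protobuf, "CmsRequest")
    # resp.PermissionResponse.Permissions lists only the actions CMS
    # actually granted; the greedy loop retries the rest.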
diff --git a/ydb/tests/library/harness/kikimr_config.py b/ydb/tests/library/harness/kikimr_config.py
index f436bd14147..2ff49601b6d 100644
--- a/ydb/tests/library/harness/kikimr_config.py
+++ b/ydb/tests/library/harness/kikimr_config.py
@@ -6,6 +6,7 @@ import tempfile
 import socket
 import six
 import yaml
+import copy
 
 from pkg_resources import resource_string
 from google.protobuf.text_format import Parse
@@ -58,8 +59,7 @@ def get_grpc_host():
     return "[::]"
 
 
-def load_default_yaml(default_tablet_node_ids, ydb_domain_name, static_erasure, n_to_select, state_storage_nodes,
-                      log_configs):
+def load_default_yaml(default_tablet_node_ids, ydb_domain_name, static_erasure, log_configs):
     data = resource_string(__name__, "resources/default_yaml.yml")
     if isinstance(data, bytes):
         data = data.decode('utf-8')
@@ -71,8 +71,6 @@ def load_default_yaml(default_tablet_node_ids, ydb_domain_name, static_erasure,
         ydb_default_log_level=int(LogLevels.from_string(os.getenv("YDB_DEFAULT_LOG_LEVEL", "NOTICE"))),
         ydb_domain_name=ydb_domain_name,
         ydb_static_erasure=static_erasure,
-        ydb_state_storage_n_to_select=n_to_select,
-        ydb_state_storage_nodes=state_storage_nodes,
         ydb_grpc_host=get_grpc_host(),
         ydb_pq_topics_are_first_class_citizen=bool(os.getenv("YDB_PQ_TOPICS_ARE_FIRST_CLASS_CITIZEN", "true")),
         ydb_pq_cluster_table_path=str(os.getenv("YDB_PQ_CLUSTER_TABLE_PATH", "")),
@@ -120,6 +118,7 @@ class KikimrConfigGenerator(object):
            dynamic_pdisk_size=PDISK_SIZE,
            dynamic_pdisks=[],
            dynamic_storage_pools=[dict(name="dynamic_storage_pool:1", kind="hdd", pdisk_user_kind=0)],
+           state_storage_rings=None,
            n_to_select=None,
            use_log_files=True,
            grpc_ssl_enable=False,
@@ -178,6 +177,9 @@ class KikimrConfigGenerator(object):
             self.n_to_select = 9
         else:
             self.n_to_select = min(5, nodes)
+        self.state_storage_rings = state_storage_rings
+        if self.state_storage_rings is None:
+            self.state_storage_rings = copy.deepcopy(self.__node_ids[: 9 if erasure == Erasure.MIRROR_3_DC else 8])
         self.__use_in_memory_pdisks = use_in_memory_pdisks or os.getenv('YDB_USE_IN_MEMORY_PDISKS') == 'true'
         self.__pdisks_directory = os.getenv('YDB_PDISKS_DIRECTORY')
         self.static_erasure = erasure
@@ -205,8 +207,7 @@ class KikimrConfigGenerator(object):
 
         self.__bs_cache_file_path = bs_cache_file_path
 
-        self.yaml_config = load_default_yaml(self.__node_ids, self.domain_name, self.static_erasure, self.n_to_select,
-                                             self.__node_ids, self.__additional_log_configs)
+        self.yaml_config = load_default_yaml(self.__node_ids, self.domain_name, self.static_erasure, self.__additional_log_configs)
         self.yaml_config["feature_flags"]["enable_public_api_external_blobs"] = enable_public_api_external_blobs
         self.yaml_config["feature_flags"]["enable_mvcc"] = "VALUE_FALSE" if disable_mvcc else "VALUE_TRUE"
         self.yaml_config['pqconfig']['enabled'] = enable_pq
@@ -447,6 +448,13 @@
     def all_node_ids(self):
         return self.__node_ids
 
+    def _add_state_storage_config(self):
+        self.yaml_config["domains_config"]["state_storage"] = []
+        self.yaml_config["domains_config"]["state_storage"].append({"ssid" : 1, "ring" : {"nto_select" : self.n_to_select, "ring" : []}})
+
+        for ring in self.state_storage_rings:
+            self.yaml_config["domains_config"]["state_storage"][0]["ring"]["ring"].append({"node" : ring if isinstance(ring, list) else [ring], "use_ring_specific_node_selection" : True})
+
     def _add_pdisk_to_static_group(self, pdisk_id, path, node_id, pdisk_category, ring):
         domain_id = len(
             self.yaml_config['blob_storage_config']["service_set"]["groups"][0]["rings"][ring]["fail_domains"])
@@ -476,12 +484,14 @@
         self.yaml_config["blob_storage_config"]["service_set"]["pdisks"] = []
         self.yaml_config["blob_storage_config"]["service_set"]["vdisks"] = []
         self.yaml_config["blob_storage_config"]["service_set"]["groups"] = [
-            {"group_id": 0, 'group_generation': 0, 'erasure_species': int(self.static_erasure)}]
+            {"group_id": 0, 'group_generation': 1, 'erasure_species': int(self.static_erasure)}]
 
         self.yaml_config["blob_storage_config"]["service_set"]["groups"][0]["rings"] = []
 
         for dc in self._dcs:
             self.yaml_config["blob_storage_config"]["service_set"]["groups"][0]["rings"].append({"fail_domains": []})
 
+        self._add_state_storage_config()
+
         for node_id in self.__node_ids:
             datacenter_id = next(datacenter_id_generator)
diff --git a/ydb/tests/library/harness/resources/default_domains.txt b/ydb/tests/library/harness/resources/default_domains.txt
index 802acce0966..5c288afa983 100644
--- a/ydb/tests/library/harness/resources/default_domains.txt
+++ b/ydb/tests/library/harness/resources/default_domains.txt
@@ -70,7 +70,10 @@
         SSId: 1
         Ring {
             NToSelect: 1
-            Node: 1
+            Ring {
+                Node: 1
+                UseRingSpecificNodeSelection: True
+            }
         }
     }
     HiveConfig {
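
For the 27-node state-storage test, state_storage_rings is [[1, 2, 3], [4, 5, 6], ..., [25, 26, 27]], and, if I read the constructor right, n_to_select stays at min(5, nodes) = 5 for Erasure.NONE. _add_state_storage_config then emits roughly this structure (dict form of the generated YAML; a sketch, not taken from the commit):

    rings = [[n, n + 1, n + 2] for n in range(1, 27, 3)]
    domains_config["state_storage"] = [{
        "ssid": 1,
        "ring": {
            "nto_select": 5,
            "ring": [{"node": r, "use_ring_specific_node_selection": True}
                     for r in rings],
        },
    }]

This is the knob the state-storage test uses to pin each three-node ring explicitly instead of letting the generator derive rings from the first node ids.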
diff --git a/ydb/tests/library/harness/resources/default_yaml.yml b/ydb/tests/library/harness/resources/default_yaml.yml
index 779f389e979..1901b1e2edd 100644
--- a/ydb/tests/library/harness/resources/default_yaml.yml
+++ b/ydb/tests/library/harness/resources/default_yaml.yml
@@ -35,11 +35,6 @@ channel_profile_config:
       pdisk_category: 0
       storage_pool_kind: "hdd"
 domains_config:
-  state_storage:
-  - ssid: 1
-    ring:
-      nto_select: {ydb_state_storage_n_to_select}
-      node: {ydb_state_storage_nodes}
   domain:
   - name: "{ydb_domain_name}"
     domain_id: 1