author     t1mursadykov <t1mursadykov@ydb.tech>  2022-11-23 15:06:06 +0300
committer  t1mursadykov <t1mursadykov@ydb.tech>  2022-11-23 15:06:06 +0300
commit     837e447ad5eca521acde76ff230a27a8d406acaa (patch)
tree       b2ca76a819f43edfaba4a87b6ab9aaf070153ecf
parent     98228b1c88aff21f1fe7e0ee2d2ed0a9c28cc781 (diff)
download   ydb-837e447ad5eca521acde76ff230a27a8d406acaa.tar.gz
CMS functional tests
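
Add functional tests for CMS (the cluster management system). The tests
bring up a local cluster, ask CMS for permission to shut down or restart
nodes under BLOCK_4_2 and MIRROR_3_DC erasure in both MODE_MAX_AVAILABILITY
and MODE_KEEP_AVAILABLE, and verify that storage groups stay writable and
tablets become active again. Library helpers for building CMS permission
and config requests are added, and KikimrConfigGenerator gains a
state_storage_rings option for explicit state storage ring layouts.

The permission flow the tests exercise, as a minimal sketch (`cluster`
stands for an already started kikimr_cluster_factory() cluster, as in the
tests below):

    import ydb.tests.library.common.cms as cms
    from ydb.core.protos.cms_pb2 import EAvailabilityMode

    # ask CMS which nodes may go down without breaking availability,
    # then stop only the approved ones
    allowed = cms.request_shutdown_nodes(cluster.client,
                                         cluster.nodes.keys(),
                                         EAvailabilityMode.MODE_KEEP_AVAILABLE)
    for node_id in allowed:
        cluster.nodes[node_id].stop()
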
-rw-r--r--  ydb/tests/functional/cms/test_cms_erasure.py              85
-rw-r--r--  ydb/tests/functional/cms/test_cms_restart.py              95
-rw-r--r--  ydb/tests/functional/cms/test_cms_state_storage.py        71
-rw-r--r--  ydb/tests/functional/cms/utils.py                         18
-rw-r--r--  ydb/tests/library/common/cms.py                           57
-rw-r--r--  ydb/tests/library/common/protobuf_cms.py                  43
-rw-r--r--  ydb/tests/library/harness/kikimr_config.py                31
-rw-r--r--  ydb/tests/library/harness/resources/default_domains.txt    5
-rw-r--r--  ydb/tests/library/harness/resources/default_yaml.yml       5
9 files changed, 397 insertions, 13 deletions
diff --git a/ydb/tests/functional/cms/test_cms_erasure.py b/ydb/tests/functional/cms/test_cms_erasure.py
new file mode 100644
index 00000000000..4d18cca1819
--- /dev/null
+++ b/ydb/tests/functional/cms/test_cms_erasure.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+import logging
+import time
+from hamcrest import assert_that
+
+from ydb.core.protos.cms_pb2 import EAvailabilityMode
+
+from ydb.tests.library.common.types import Erasure
+from ydb.tests.library.common.protobuf import KVRequest
+import ydb.tests.library.common.cms as cms
+from ydb.tests.library.harness.util import LogLevels
+from ydb.tests.library.harness.kikimr_cluster import kikimr_cluster_factory
+from ydb.tests.library.harness.kikimr_config import KikimrConfigGenerator
+from ydb.tests.library.kv.helpers import create_tablets_and_wait_for_start
+from ydb.tests.library.matchers.response import is_ok_response
+
+import utils
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractLocalClusterTest(object):
+ erasure = None
+ mode = None
+
+ @classmethod
+ def setup_class(cls):
+ nodes_count = 8 if cls.erasure == Erasure.BLOCK_4_2 else 9
+ configurator = KikimrConfigGenerator(cls.erasure,
+ nodes=nodes_count,
+ additional_log_configs={'CMS': LogLevels.DEBUG}
+ )
+ cls.cluster = kikimr_cluster_factory(configurator=configurator)
+ cls.cluster.start()
+        # CMS will not allow disabling state storage
+        # nodes for the first 2 minutes after start
+ time.sleep(120)
+ cms.request_increase_ratio_limit(cls.cluster.client)
+
+ @classmethod
+ def teardown_class(cls):
+ cls.cluster.stop()
+
+
+class AbstractTestCmsDegradedGroups(AbstractLocalClusterTest):
+ def test_no_degraded_groups_after_shutdown(self):
+ number_of_tablets = 10
+ tablet_ids = create_tablets_and_wait_for_start(
+ self.cluster.client, number_of_tablets,
+ batch_size=number_of_tablets,
+ timeout_seconds=120
+ )
+
+ allowed_hosts = cms.request_shutdown_nodes(self.cluster.client, self.cluster.nodes.keys(), type(self).mode)
+ for node in allowed_hosts:
+ self.cluster.nodes[node].stop()
+
+ client = utils.create_client_from_alive_hosts(self.cluster, allowed_hosts)
+        # if there are no degraded groups,
+        # then the write returns OK
+ for tablet_id in tablet_ids:
+ resp = client.kv_request(
+ tablet_id, KVRequest().write(bytes("key", 'utf-8'), bytes(utils.value_for("key", tablet_id), 'utf-8'))
+ )
+ assert_that(resp, is_ok_response())
+
+
+class TestDegradedGroupBlock42Max(AbstractTestCmsDegradedGroups):
+ erasure = Erasure.BLOCK_4_2
+ mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestDegradedGroupBlock42Keep(AbstractTestCmsDegradedGroups):
+ erasure = Erasure.BLOCK_4_2
+ mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
+
+
+class TestDegradedGroupMirror3dcMax(AbstractTestCmsDegradedGroups):
+ erasure = Erasure.MIRROR_3_DC
+ mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestDegradedGroupMirror3dcKeep(AbstractTestCmsDegradedGroups):
+ erasure = Erasure.MIRROR_3_DC
+ mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
diff --git a/ydb/tests/functional/cms/test_cms_restart.py b/ydb/tests/functional/cms/test_cms_restart.py
new file mode 100644
index 00000000000..30db244eaed
--- /dev/null
+++ b/ydb/tests/functional/cms/test_cms_restart.py
@@ -0,0 +1,95 @@
+import logging
+import time
+
+from hamcrest import assert_that
+
+from ydb.core.protos.cms_pb2 import EAvailabilityMode
+
+from ydb.tests.library.common.types import Erasure
+from ydb.tests.library.common.protobuf import KVRequest
+import ydb.tests.library.common.cms as cms
+from ydb.tests.library.harness.kikimr_config import KikimrConfigGenerator
+from ydb.tests.library.harness.util import LogLevels
+from ydb.tests.library.harness.kikimr_cluster import kikimr_cluster_factory
+from ydb.tests.library.kv.helpers import create_tablets_and_wait_for_start
+from ydb.tests.library.common.delayed import wait_tablets_are_active
+from ydb.tests.library.matchers.response import is_ok_response
+
+import utils
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractLocalClusterTest(object):
+ erasure = None
+ mode = None
+
+ @classmethod
+ def setup_class(cls):
+ nodes_count = 8 if cls.erasure == Erasure.BLOCK_4_2 else 9
+ nodes_count *= 2
+ configurator = KikimrConfigGenerator(cls.erasure,
+ nodes=nodes_count,
+ additional_log_configs={'CMS': LogLevels.DEBUG},
+ )
+ cls.cluster = kikimr_cluster_factory(configurator=configurator)
+ cls.cluster.start()
+
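+        # CMS will not allow disabling state storage nodes for the first 2 minutes after start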
+ time.sleep(120)
+ cms.request_increase_ratio_limit(cls.cluster.client)
+
+ @classmethod
+ def teardown_class(cls):
+ cls.cluster.stop()
+
+
+class AbstractTestCmsStateStorageRestarts(AbstractLocalClusterTest):
+    def test_restart_as_much_as_possible(self):
+ number_of_tablets = 20
+ tablet_ids = create_tablets_and_wait_for_start(
+ self.cluster.client, number_of_tablets,
+ batch_size=number_of_tablets,
+ timeout_seconds=120
+ )
+
+ restart_nodes = cms.request_shutdown_as_much_as_possible(self.cluster.client, self.cluster.nodes.keys(), type(self).mode)
+
+ for node in restart_nodes:
+ self.cluster.nodes[node].stop()
+
+ client = utils.create_client_from_alive_hosts(self.cluster, restart_nodes)
+
+ for tablet_id in tablet_ids:
+ client.tablet_kill(tablet_id)
+
+ for tablet_id in tablet_ids:
+ resp = client.kv_request(
+ tablet_id, KVRequest().write(bytes("key", 'utf-8'), bytes(utils.value_for("key", tablet_id), 'utf-8'))
+ )
+ assert_that(resp, is_ok_response())
+
+ for node in restart_nodes:
+ self.cluster.nodes[node].start()
+
+ wait_tablets_are_active(self.cluster.client, tablet_ids)
+
+
+class TestCmsStateStorageRestartsBlockMax(AbstractTestCmsStateStorageRestarts):
+ erasure = Erasure.BLOCK_4_2
+ mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestCmsStateStorageRestartsBlockKeep(AbstractTestCmsStateStorageRestarts):
+ erasure = Erasure.BLOCK_4_2
+ mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
+
+
+class TestCmsStateStorageRestartsMirrorMax(AbstractTestCmsStateStorageRestarts):
+ erasure = Erasure.MIRROR_3_DC
+ mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
+
+
+class TestCmsStateStorageRestartsMirrorKeep(AbstractTestCmsStateStorageRestarts):
+ erasure = Erasure.MIRROR_3_DC
+ mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
diff --git a/ydb/tests/functional/cms/test_cms_state_storage.py b/ydb/tests/functional/cms/test_cms_state_storage.py
new file mode 100644
index 00000000000..75c2b922b89
--- /dev/null
+++ b/ydb/tests/functional/cms/test_cms_state_storage.py
@@ -0,0 +1,71 @@
+import logging
+import time
+
+from ydb.core.protos.cms_pb2 import EAvailabilityMode
+
+from ydb.tests.library.common.types import Erasure
+import ydb.tests.library.common.cms as cms
+from ydb.tests.library.harness.kikimr_config import KikimrConfigGenerator
+from ydb.tests.library.harness.util import LogLevels
+from ydb.tests.library.harness.kikimr_cluster import kikimr_cluster_factory
+from ydb.tests.library.kv.helpers import create_tablets_and_wait_for_start
+from ydb.tests.library.common.delayed import wait_tablets_are_active
+
+import utils
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractLocalClusterTest(object):
+ erasure = None
+ mode = None
+
+ @classmethod
+ def setup_class(cls):
+ configurator = KikimrConfigGenerator(Erasure.NONE,
+ nodes=27,
+ additional_log_configs={'CMS': LogLevels.DEBUG},
+ state_storage_rings=[[n, n + 1, n + 2] for n in range(1, 27, 3)]
+ )
+ cls.cluster = kikimr_cluster_factory(configurator=configurator)
+ cls.cluster.start()
+
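+        # CMS will not allow disabling state storage nodes for the first 2 minutes after start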
+ time.sleep(120)
+ cms.request_increase_ratio_limit(cls.cluster.client)
+
+ @classmethod
+ def teardown_class(cls):
+ cls.cluster.stop()
+
+
+class AbstractTestCmsStateStorageSimple(AbstractLocalClusterTest):
+ def test_check_shutdown_state_storage_nodes(self):
+ number_of_tablets = 10
+ tablet_ids = create_tablets_and_wait_for_start(
+ self.cluster.client, number_of_tablets,
+ batch_size=number_of_tablets,
+ timeout_seconds=120
+ )
+
+ allowed_hosts = cms.request_shutdown_nodes(self.cluster.client,
+ self.cluster.nodes.keys(),
+ type(self).mode)
+
+ for node in allowed_hosts:
+ self.cluster.nodes[node].stop()
+
+ client = utils.create_client_from_alive_hosts(self.cluster, allowed_hosts)
+
+ for tablet_id in tablet_ids:
+ client.tablet_kill(tablet_id)
+
+ wait_tablets_are_active(client, tablet_ids)
+
+
+class TestCmsStateStorageSimpleKeep(AbstractTestCmsStateStorageSimple):
+ mode = EAvailabilityMode.MODE_KEEP_AVAILABLE
+
+
+class TestCmsStateStorageSimpleMax(AbstractTestCmsStateStorageSimple):
+ mode = EAvailabilityMode.MODE_MAX_AVAILABILITY
diff --git a/ydb/tests/functional/cms/utils.py b/ydb/tests/functional/cms/utils.py
new file mode 100644
index 00000000000..5eacd735762
--- /dev/null
+++ b/ydb/tests/functional/cms/utils.py
@@ -0,0 +1,18 @@
+from ydb.tests.library.harness.kikimr_client import kikimr_client_factory
+
+
+def value_for(key, tablet_id):
+ return "Value: <key = {key}, tablet_id = {tablet_id}>".format(
+ key=key, tablet_id=tablet_id)
+
+
+def create_client_from_alive_hosts(cluster, stopped_nodeids):
+    # connect to any node that is not in the stopped list; node 1 is the fallback
+    alive_node_id, alive_node = 1, cluster.nodes[1]
+    for node_id, node in cluster.nodes.items():
+        if node_id not in stopped_nodeids:
+            alive_node_id, alive_node = node_id, node
+            break
+    assert alive_node_id not in stopped_nodeids
+    client = kikimr_client_factory(alive_node.host, alive_node.grpc_port, retry_count=100)
+    return client
diff --git a/ydb/tests/library/common/cms.py b/ydb/tests/library/common/cms.py
new file mode 100644
index 00000000000..1f1fa9b50e6
--- /dev/null
+++ b/ydb/tests/library/common/cms.py
@@ -0,0 +1,57 @@
+import logging
+
+import ydb.tests.library.common.protobuf_cms as cms_pb
+from ydb.core.protos import cms_pb2
+
+logger = logging.getLogger(__name__)
+
+
+def request_increase_ratio_limit(client):
+ req = cms_pb.CmsConfigRequest()
+ client.send_request(req.protobuf, "CmsRequest")
+
+
+def request_nodes(client, node_ids, mode, action_type):
+    req = cms_pb.CmsPermissionRequest()
+    for node_id in node_ids:
+        req.add_action(str(node_id), action_type)
+ req.set_mode(mode)
+
+ logger.info("Sending permission request to CMS: %s", req.protobuf)
+ resp = client.send_request(req.protobuf, "CmsRequest")
+ logger.info("Got response from CMS: %s", resp.PermissionResponse)
+
+ nodes = []
+ for perm in resp.PermissionResponse.Permissions:
+ nodes.append(int(perm.Action.Host))
+
+ return nodes
+
+
+def request_shutdown_nodes(client, node_ids, mode):
+ return request_nodes(client, node_ids, mode, cms_pb2.TAction.SHUTDOWN_HOST)
+
+
+def request_restart_services(client, node_ids, mode):
+ return request_nodes(client, node_ids, mode, cms_pb2.TAction.RESTART_SERVICES)
+
+
+def request_as_much_as_possible(client, node_ids, mode, action_type):
+    # keep asking CMS for permissions until it stops approving more nodes
+    not_allowed = list(node_ids)
+    restart_nodes = []
+    allowed_nodes = request_nodes(client, node_ids, mode, action_type)
+
+ while len(allowed_nodes) > 0:
+ restart_nodes.extend(allowed_nodes)
+
+ for node_id in allowed_nodes:
+ not_allowed.remove(node_id)
+
+        allowed_nodes = request_nodes(client, not_allowed, mode, action_type)
+
+ return restart_nodes
+
+
+def request_shutdown_as_much_as_possible(client, node_ids, mode):
+ return request_as_much_as_possible(client, node_ids, mode, cms_pb2.TAction.SHUTDOWN_HOST)
diff --git a/ydb/tests/library/common/protobuf_cms.py b/ydb/tests/library/common/protobuf_cms.py
new file mode 100644
index 00000000000..f51590b45b8
--- /dev/null
+++ b/ydb/tests/library/common/protobuf_cms.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from ydb.core.protos import cms_pb2
+import ydb.core.protos.msgbus_pb2 as msgbus
+from ydb.tests.library.common.protobuf import AbstractProtobufBuilder
+
+
+class CmsPermissionRequest(AbstractProtobufBuilder):
+ def __init__(self):
+ super(CmsPermissionRequest, self).__init__(
+ msgbus.TCmsRequest())
+
+ self.protobuf.PermissionRequest.User = "user"
+ self.protobuf.PermissionRequest.Schedule = False
+ self.protobuf.PermissionRequest.DryRun = False
+ self.protobuf.PermissionRequest.PartialPermissionAllowed = True
+
+    def add_action(self, host, action_type):
+        action = self.protobuf.PermissionRequest.Actions.add()
+        action.Duration = 60000000  # 60 seconds, in microseconds
+        action.Type = action_type
+        action.Host = host
+        if action_type == cms_pb2.TAction.RESTART_SERVICES:
+            action.Services.append("storage")
+
+ def set_mode(self, mode):
+ self.protobuf.PermissionRequest.AvailabilityMode = mode
+
+
+class CmsConfigRequest(AbstractProtobufBuilder):
+ def __init__(self):
+ super(CmsConfigRequest, self).__init__(msgbus.TCmsRequest())
+
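+        # lift the disabled-nodes ratio limits to 100% so tests may take down as many nodes as CMS approves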
+ self.protobuf.SetConfigRequest.Config.ClusterLimits.DisabledNodesRatioLimit = 100
+ self.protobuf.SetConfigRequest.Config.TenantLimits.DisabledNodesRatioLimit = 100
+
+
+class CmsStateRequest(AbstractProtobufBuilder):
+ def __init__(self):
+ super(CmsStateRequest, self).__init__(msgbus.TCmsRequest())
+
+ self.protobuf.ClusterStateRequest.CopyFrom(cms_pb2.TClusterStateRequest())
diff --git a/ydb/tests/library/harness/kikimr_config.py b/ydb/tests/library/harness/kikimr_config.py
index f436bd14147..2ff49601b6d 100644
--- a/ydb/tests/library/harness/kikimr_config.py
+++ b/ydb/tests/library/harness/kikimr_config.py
@@ -6,6 +6,7 @@ import tempfile
import socket
import six
import yaml
+import copy
from pkg_resources import resource_string
from google.protobuf.text_format import Parse
@@ -58,8 +59,7 @@ def get_grpc_host():
return "[::]"
-def load_default_yaml(default_tablet_node_ids, ydb_domain_name, static_erasure, n_to_select, state_storage_nodes,
- log_configs):
+def load_default_yaml(default_tablet_node_ids, ydb_domain_name, static_erasure, log_configs):
data = resource_string(__name__, "resources/default_yaml.yml")
if isinstance(data, bytes):
data = data.decode('utf-8')
@@ -71,8 +71,6 @@ def load_default_yaml(default_tablet_node_ids, ydb_domain_name, static_erasure,
ydb_default_log_level=int(LogLevels.from_string(os.getenv("YDB_DEFAULT_LOG_LEVEL", "NOTICE"))),
ydb_domain_name=ydb_domain_name,
ydb_static_erasure=static_erasure,
- ydb_state_storage_n_to_select=n_to_select,
- ydb_state_storage_nodes=state_storage_nodes,
ydb_grpc_host=get_grpc_host(),
ydb_pq_topics_are_first_class_citizen=bool(os.getenv("YDB_PQ_TOPICS_ARE_FIRST_CLASS_CITIZEN", "true")),
ydb_pq_cluster_table_path=str(os.getenv("YDB_PQ_CLUSTER_TABLE_PATH", "")),
@@ -120,6 +118,7 @@ class KikimrConfigGenerator(object):
dynamic_pdisk_size=PDISK_SIZE,
dynamic_pdisks=[],
dynamic_storage_pools=[dict(name="dynamic_storage_pool:1", kind="hdd", pdisk_user_kind=0)],
+ state_storage_rings=None,
n_to_select=None,
use_log_files=True,
grpc_ssl_enable=False,
@@ -178,6 +177,10 @@
self.n_to_select = 9
else:
self.n_to_select = min(5, nodes)
+ self.state_storage_rings = state_storage_rings
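+        # default layout: the first 8 nodes (9 for mirror-3-dc) each form a single-node ring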
+ if self.state_storage_rings is None:
+ self.state_storage_rings = copy.deepcopy(self.__node_ids[: 9 if erasure == Erasure.MIRROR_3_DC else 8])
self.__use_in_memory_pdisks = use_in_memory_pdisks or os.getenv('YDB_USE_IN_MEMORY_PDISKS') == 'true'
self.__pdisks_directory = os.getenv('YDB_PDISKS_DIRECTORY')
self.static_erasure = erasure
@@ -205,8 +207,7 @@ class KikimrConfigGenerator(object):
self.__bs_cache_file_path = bs_cache_file_path
- self.yaml_config = load_default_yaml(self.__node_ids, self.domain_name, self.static_erasure, self.n_to_select,
- self.__node_ids, self.__additional_log_configs)
+ self.yaml_config = load_default_yaml(self.__node_ids, self.domain_name, self.static_erasure, self.__additional_log_configs)
self.yaml_config["feature_flags"]["enable_public_api_external_blobs"] = enable_public_api_external_blobs
self.yaml_config["feature_flags"]["enable_mvcc"] = "VALUE_FALSE" if disable_mvcc else "VALUE_TRUE"
self.yaml_config['pqconfig']['enabled'] = enable_pq
@@ -447,6 +448,19 @@
def all_node_ids(self):
return self.__node_ids
+    def _add_state_storage_config(self):
+        # one state storage group (ssid 1); each entry of state_storage_rings
+        # is a ring given either as a single node id or as a list of node ids
+        self.yaml_config["domains_config"]["state_storage"] = [
+            {"ssid": 1, "ring": {"nto_select": self.n_to_select, "ring": []}}
+        ]
+
+        for ring in self.state_storage_rings:
+            self.yaml_config["domains_config"]["state_storage"][0]["ring"]["ring"].append({
+                "node": ring if isinstance(ring, list) else [ring],
+                "use_ring_specific_node_selection": True,
+            })
+
def _add_pdisk_to_static_group(self, pdisk_id, path, node_id, pdisk_category, ring):
domain_id = len(
self.yaml_config['blob_storage_config']["service_set"]["groups"][0]["rings"][ring]["fail_domains"])
@@ -476,12 +484,14 @@ class KikimrConfigGenerator(object):
self.yaml_config["blob_storage_config"]["service_set"]["pdisks"] = []
self.yaml_config["blob_storage_config"]["service_set"]["vdisks"] = []
self.yaml_config["blob_storage_config"]["service_set"]["groups"] = [
- {"group_id": 0, 'group_generation': 0, 'erasure_species': int(self.static_erasure)}]
+ {"group_id": 0, 'group_generation': 1, 'erasure_species': int(self.static_erasure)}]
self.yaml_config["blob_storage_config"]["service_set"]["groups"][0]["rings"] = []
for dc in self._dcs:
self.yaml_config["blob_storage_config"]["service_set"]["groups"][0]["rings"].append({"fail_domains": []})
+ self._add_state_storage_config()
+
for node_id in self.__node_ids:
datacenter_id = next(datacenter_id_generator)
diff --git a/ydb/tests/library/harness/resources/default_domains.txt b/ydb/tests/library/harness/resources/default_domains.txt
index 802acce0966..5c288afa983 100644
--- a/ydb/tests/library/harness/resources/default_domains.txt
+++ b/ydb/tests/library/harness/resources/default_domains.txt
@@ -70,7 +70,10 @@
SSId: 1
Ring {
NToSelect: 1
- Node: 1
+ Ring {
+ Node: 1
+ UseRingSpecificNodeSelection: True
+ }
}
}
HiveConfig {
diff --git a/ydb/tests/library/harness/resources/default_yaml.yml b/ydb/tests/library/harness/resources/default_yaml.yml
index 779f389e979..1901b1e2edd 100644
--- a/ydb/tests/library/harness/resources/default_yaml.yml
+++ b/ydb/tests/library/harness/resources/default_yaml.yml
@@ -35,11 +35,6 @@ channel_profile_config:
pdisk_category: 0
storage_pool_kind: "hdd"
domains_config:
- state_storage:
- - ssid: 1
- ring:
- nto_select: {ydb_state_storage_n_to_select}
- node: {ydb_state_storage_nodes}
domain:
- name: "{ydb_domain_name}"
domain_id: 1