diff options
author | Maxim Yurchuk <maxim-yurchuk@ydb.tech> | 2024-12-11 11:55:05 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-12-11 14:55:05 +0300 |
commit | 410cfb65f31f95776ceba2c28a6a9df5aa7188b3 (patch) | |
tree | e448438d8093345b4d41728a74c953f642c0f0fa | |
parent | 44ec704a56a807ac346d0873ac86965a9fbd7e15 (diff) | |
download | ydb-410cfb65f31f95776ceba2c28a6a9df5aa7188b3.tar.gz |
Standalone tool for nemesis (#12502)
-rw-r--r-- | ydb/tests/stability/library/__main__.py | 202 | ||||
-rw-r--r-- | ydb/tests/stability/library/ya.make | 40 | ||||
-rw-r--r-- | ydb/tests/stability/ya.make | 1 |
3 files changed, 243 insertions, 0 deletions
diff --git a/ydb/tests/stability/library/__main__.py b/ydb/tests/stability/library/__main__.py new file mode 100644 index 00000000000..fbe66af6931 --- /dev/null +++ b/ydb/tests/stability/library/__main__.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- + +import getpass +import os +import logging +import tempfile +import stat +import sys +import argparse + +from library.python import resource + +logging.getLogger().setLevel(logging.DEBUG) +logging.getLogger().addHandler(logging.StreamHandler(sys.stderr)) + +from ydb.tests.library.common.composite_assert import CompositeAssert # noqa +from ydb.tests.library.harness.kikimr_cluster import ExternalKiKiMRCluster # noqa +from ydb.tests.library.matchers.collection import is_empty # noqa +from ydb.tests.library.wardens.factories import safety_warden_factory, liveness_warden_factory # noqa + +logger = logging.getLogger('ydb.connection') +logger.setLevel(logging.CRITICAL) +logger = logging.getLogger(__name__) + + +STRESS_BINARIES_DEPLOY_PATH = '/Berkanavt/nemesis/bin/' + + +class StabilityCluster: + def __init__(self, ssh_username, cluster_path, ydbd_path, ydbd_next_path=None): + self.working_dir = os.path.join(tempfile.gettempdir(), "ydb_stability") + os.makedirs(self.working_dir, exist_ok=True) + self.ssh_username = ssh_username + self.slice_directory = cluster_path + self.ydbd_path = ydbd_path + self.ydbd_next_path = ydbd_next_path + + self.artifacts = ( + self._unpack_resource('nemesis'), + self._unpack_resource('simple_queue'), + self._unpack_resource('olap_workload'), + self._unpack_resource('statistics_workload'), + ) + + self.kikimr_cluster = ExternalKiKiMRCluster( + config_path=self.slice_directory, + kikimr_configure_binary_path=self._unpack_resource("cfg"), + kikimr_path=self.ydbd_path, + kikimr_next_path=self.ydbd_next_path, + ssh_username=self.ssh_username, + deploy_cluster=True, + ) + + def _unpack_resource(self, name): + res = resource.find(name) + path_to_unpack = os.path.join(self.working_dir, name) + with open(path_to_unpack, "wb") as f: + f.write(res) + + st = os.stat(path_to_unpack) + os.chmod(path_to_unpack, st.st_mode | stat.S_IEXEC) + return path_to_unpack + + def start_nemesis(self): + for node in self.kikimr_cluster.nodes.values(): + node.ssh_command("sudo service nemesis restart", raise_on_error=True) + + def stop_nemesis(self): + for node in self.kikimr_cluster.nodes.values(): + node.ssh_command("sudo service nemesis stop", raise_on_error=False) + + def setup(self, is_deploy_cluster): + self._stop_nemesis() + self.kikimr_cluster.start() + + if is_deploy_cluster: + # cleanup nemesis logs + for node in self.kikimr_cluster.nodes.values(): + node.ssh_command('sudo rm -rf /Berkanavt/nemesis/logs/*', raise_on_error=False) + node.ssh_command('sudo pkill screen', raise_on_error=False) + + with open(self._unpack_resource("tbl_profile.txt")) as f: + self.kikimr_cluster.client.console_request(f.read()) + self.kikimr_cluster.client.update_self_heal(True) + + node = list(self.kikimr_cluster.nodes.values())[0] + node.ssh_command("/Berkanavt/kikimr/bin/kikimr admin console validator disable bootstrap", raise_on_error=True) + + for node in self.kikimr_cluster.nodes.values(): + node.ssh_command(["sudo", "mkdir", "-p", STRESS_BINARIES_DEPLOY_PATH], raise_on_error=False) + for artifact in self.artifacts: + node.copy_file_or_dir( + artifact, + os.path.join( + STRESS_BINARIES_DEPLOY_PATH, + os.path.basename( + artifact + ) + ) + ) + + +def path_type(path): + # Expand the user's home directory if ~ is present + expanded_path = os.path.expanduser(path) + # Check if the file exists + if not os.path.exists(expanded_path): + raise argparse.ArgumentTypeError(f"The file {expanded_path} does not exist.") + return expanded_path + + +def parse_args(): + parser = argparse.ArgumentParser( + description="""Chaos and cross-version testing tool""" + ) + parser.add_argument( + "--cluster_path", + required=True, + type=path_type, + help="Path to cluster.yaml", + ) + parser.add_argument( + "--ydbd_path", + required=True, + type=path_type, + help="Path to ydbd", + ) + parser.add_argument( + "--ssh_user", + required=False, + default=getpass.getuser(), + type=str, + help="Path to ydbd", + ) + parser.add_argument( + "actions", + type=str, + nargs="+", + choices=[ + "start_nemesis", + "stop_nemesis", + "start_workload_simple_queue", + "stop_workload", + "perform_checks", + ], + help="actions to execute", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + ssh_username = args.ssh_user + stability_cluster = StabilityCluster( + ssh_username=ssh_username, + cluster_path=args.cluster_path, + ydbd_path=args.ydbd_path, + ) + + for action in args.actions: + setup_command = False + if setup_command: + stability_cluster.setup(True) + + if action == "start_workload_simple_queue": + for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()): + node.ssh_command( + 'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/simple_queue --database /Root/db1 ; done"', + raise_on_error=True + ) + if action == "stop_workload": + for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()): + node.ssh_command( + 'sudo pkill screen', + raise_on_error=True + ) + + if action == "stop_nemesis": + stability_cluster.stop_nemesis() + + if action == "start_nemesis": + stability_cluster.start_nemesis() + + if action == "perform_checks": + composite_assert = CompositeAssert() + composite_assert.assert_that( + safety_warden_factory(stability_cluster.kikimr_cluster, ssh_username).list_of_safety_violations(), + is_empty(), + "No safety violations by Safety Warden" + ) + + composite_assert.assert_that( + liveness_warden_factory(stability_cluster.kikimr_cluster, ssh_username).list_of_liveness_violations, + is_empty(), + "No liveness violations by liveness warden", + ) + + composite_assert.finish() + + +if __name__ == "__main__": + main() diff --git a/ydb/tests/stability/library/ya.make b/ydb/tests/stability/library/ya.make new file mode 100644 index 00000000000..8859105d764 --- /dev/null +++ b/ydb/tests/stability/library/ya.make @@ -0,0 +1,40 @@ +PY3_PROGRAM() + +PY_SRCS( + __main__.py +) + +DATA( + arcadia/ydb/tests/stability/resources +) + +DEPENDS( + ydb/tools/cfg/bin + ydb/tests/tools/nemesis/driver +) + +BUNDLE( + ydb/tools/simple_queue NAME simple_queue + ydb/tools/olap_workload NAME olap_workload + ydb/tools/statistics_workload NAME statistics_workload + ydb/tools/cfg/bin NAME cfg + ydb/tests/tools/nemesis/driver NAME nemesis +) + +RESOURCE( + simple_queue simple_queue + olap_workload olap_workload + statistics_workload statistics_workload + cfg cfg + nemesis nemesis + ydb/tests/stability/resources/tbl_profile.txt tbl_profile.txt +) + + +PEERDIR( + ydb/tests/library + ydb/tests/library/wardens +) + +END() + diff --git a/ydb/tests/stability/ya.make b/ydb/tests/stability/ya.make index 8b83b3f649a..051f8bd7d80 100644 --- a/ydb/tests/stability/ya.make +++ b/ydb/tests/stability/ya.make @@ -1,3 +1,4 @@ RECURSE( ydb + library ) |