diff options
author | senya0x5f <senya0x5f@yandex-team.com> | 2023-08-22 19:53:37 +0300 |
---|---|---|
committer | senya0x5f <senya0x5f@yandex-team.com> | 2023-08-22 20:23:16 +0300 |
commit | b4bdd3643d5a44e1440cad0d7c724b3b3de350c4 (patch) | |
tree | b4fb23990083a5118622b9f8708e6a72d586c06f | |
parent | 24dd18e2743431eef40c048bb9036afb2ca11d78 (diff) | |
download | ydb-b4bdd3643d5a44e1440cad0d7c724b3b3de350c4.tar.gz |
KIKIMR-17302 Ignore bad serials, log them, monitor them
-rw-r--r-- | ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp | 123 | ||||
-rw-r--r-- | ydb/core/blobstorage/nodewarden/node_warden_impl.cpp | 84 | ||||
-rw-r--r-- | ydb/core/blobstorage/nodewarden/node_warden_impl.h | 33 | ||||
-rw-r--r-- | ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp | 3 | ||||
-rw-r--r-- | ydb/library/pdisk_io/file_params.h | 2 | ||||
-rw-r--r-- | ydb/library/pdisk_io/file_params_darwin.cpp | 2 | ||||
-rw-r--r-- | ydb/library/pdisk_io/file_params_linux.cpp | 13 | ||||
-rw-r--r-- | ydb/library/pdisk_io/file_params_win.cpp | 2 |
8 files changed, 253 insertions, 9 deletions
diff --git a/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp b/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp index e8aaee7dd39..8e71f77d277 100644 --- a/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp +++ b/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp @@ -11,11 +11,13 @@ #include <ydb/core/base/statestorage_impl.h> #include <ydb/core/blobstorage/crypto/default.h> #include <ydb/core/blobstorage/nodewarden/node_warden.h> +#include <ydb/core/blobstorage/nodewarden/node_warden_impl.h> #include <ydb/core/blobstorage/base/blobstorage_events.h> #include <ydb/core/blobstorage/pdisk/blobstorage_pdisk_tools.h> #include <ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut_http_request.h> #include <ydb/core/mind/bscontroller/bsc.h> #include <ydb/core/mind/local.h> +#include <ydb/core/util/testactorsys.h> #include <ydb/library/pdisk_io/sector_map.h> #include <util/random/entropy.h> @@ -483,6 +485,127 @@ Y_UNIT_TEST_SUITE(TBlobStorageWardenTest) { BlockGroup(runtime, sender0, tabletId, groupId, generation++, true, NKikimrProto::EReplyStatus::NO_GROUP); } + CUSTOM_UNIT_TEST(TestFilterBadSerials) { + TTestActorSystem runtime(1); + runtime.Start(); + + TIntrusivePtr<TNodeWardenConfig> nodeWardenConfig(new TNodeWardenConfig(static_cast<IPDiskServiceFactory*>(new TRealPDiskServiceFactory()))); + + IActor* ac = CreateBSNodeWarden(nodeWardenConfig.Release()); + + TActorId nodeWarden = runtime.Register(ac, 1); + + runtime.WrapInActorContext(nodeWarden, [](IActor* wardenActor) { + auto vectorsEqual = [](const TVector<TString>& vec1, const TVector<TString>& vec2) { + TVector<TString> sortedVec1 = vec1; + TVector<TString> sortedVec2 = vec2; + + std::sort(sortedVec1.begin(), sortedVec1.end()); + std::sort(sortedVec2.begin(), sortedVec2.end()); + + return sortedVec1 == sortedVec2; + }; + + auto checkHasOnlyGoodDrive = [](TVector<NPDisk::TDriveData>& drives) { + UNIT_ASSERT_EQUAL(1, drives.size()); + UNIT_ASSERT_EQUAL(drives[0].Path, "/good1"); + }; + + NStorage::TNodeWarden& warden = *dynamic_cast<NStorage::TNodeWarden*>(wardenActor); + + NPDisk::TDriveData goodDrive1; + goodDrive1.Path = "/good1"; + goodDrive1.SerialNumber = "FOOBAR"; + + NPDisk::TDriveData goodDrive2; + goodDrive2.Path = "/good2"; + goodDrive2.SerialNumber = "BARFOO"; + + NPDisk::TDriveData badDrive1; + badDrive1.Path = "/bad1"; + char s[] = {50, 51, 52, -128}; // Non-ASCII character -128. + badDrive1.SerialNumber = TString(s); + + NPDisk::TDriveData badDrive2; + badDrive2.Path = "/bad2"; + badDrive2.SerialNumber = "NOT\tGOOD"; // Non-printable character \t. + + NPDisk::TDriveData badDrive3; + badDrive3.Path = "/bad3"; + badDrive3.SerialNumber = TString(101, 'F'); // Size exceeds 100. + + NPDisk::TDriveData badDrive4; + badDrive4.Path = "/bad4"; + badDrive4.SerialNumber = "NOTGOODEITHER"; + badDrive4.SerialNumber[5] = '\0'; // Unexpected null-terminator. + + TStringStream details; + + // Check for zero drives. + { + TVector<NPDisk::TDriveData> drives; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + UNIT_ASSERT_EQUAL(0, drives.size()); + UNIT_ASSERT_EQUAL(0, warden.DrivePathCounterKeys().size()); + } + + // If a drive is not present in a subsequent call, then it is removed from a counters map. + // We check both serial number validator and also that counters are removed for missing drives. + { + TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive1}; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + checkHasOnlyGoodDrive(drives); + UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad1"})); + } + { + TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive2}; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + checkHasOnlyGoodDrive(drives); + UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad2"})); + } + { + TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive3}; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + checkHasOnlyGoodDrive(drives); + UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad3"})); + } + { + TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive4}; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + checkHasOnlyGoodDrive(drives); + UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad4"})); + } + { + TVector<NPDisk::TDriveData> drives = {goodDrive1, goodDrive2, badDrive4}; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + UNIT_ASSERT_EQUAL(2, drives.size()); + UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/good2", "/bad4"})); + } + // Check that good drives can also be removed from counters map. + { + TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive4}; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + checkHasOnlyGoodDrive(drives); + UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad4"})); + } + // Check that everything is removed if there are no drives. + { + TVector<NPDisk::TDriveData> drives; + warden.RemoveDrivesWithBadSerialsAndReport(drives, details); + + UNIT_ASSERT_EQUAL(0, drives.size()); + UNIT_ASSERT_EQUAL(0, warden.DrivePathCounterKeys().size()); + } + }); + } + CUSTOM_UNIT_TEST(TestSendToInvalidGroupId) { TTestBasicRuntime runtime(1, false); Setup(runtime, "", nullptr); diff --git a/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp b/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp index eb0edf6a5cd..1c3f483f9de 100644 --- a/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp +++ b/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp @@ -7,8 +7,88 @@ using namespace NKikimr; using namespace NStorage; +void TNodeWarden::RemoveDrivesWithBadSerialsAndReport(TVector<NPDisk::TDriveData>& drives, TStringStream& details) { + // Serial number's size definitely won't exceed this number of bytes. + size_t maxSerialSizeInBytes = 100; + + auto isValidSerial = [maxSerialSizeInBytes](TString& serial) { + if (serial.Size() > maxSerialSizeInBytes) { + // Not sensible size. + return false; + } + + // Check if serial number contains only ASCII characters. + for (size_t i = 0; i < serial.Size(); ++i) { + i8 c = serial[i]; + + if (c <= 0) { + // Encountered null terminator earlier than expected or non-ASCII character. + return false; + } + + if (!isprint(c)) { + // Encountered non-printable character. + return false; + } + } + + return true; + }; + + std::unordered_set<TString> drivePaths; + + for (const auto& drive : drives) { + drivePaths.insert(drive.Path); + } + + // Remove counters for drives that are no longer present. + for (auto countersIt = ByPathDriveCounters.begin(); countersIt != ByPathDriveCounters.end();) { + if (drivePaths.find(countersIt->first) == drivePaths.end()) { + countersIt = ByPathDriveCounters.erase(countersIt); + } else { + countersIt++; + } + } + + // Prepare removal of drives with invalid serials. + auto toRemove = std::remove_if(drives.begin(), drives.end(), [&isValidSerial](auto& driveData) { + TString& serial = driveData.SerialNumber; + + return !isValidSerial(serial); + }); + + // Add counters for every drive path. + for (auto it = drives.begin(); it != toRemove; ++it) { + TString& path = it->Path; + ByPathDriveCounters.try_emplace(path, AppData()->Counters, path); + } + + // And for drives with invalid serials log serial and report to the monitoring. + for (auto it = toRemove; it != drives.end(); ++it) { + TString& serial = it->SerialNumber; + TString& path = it->Path; + + auto [mapIt, _] = ByPathDriveCounters.try_emplace(path, AppData()->Counters, path); + + // Cut string in case it exceeds max size. + size_t size = std::min(serial.Size(), maxSerialSizeInBytes); + + // Encode in case it contains weird symbols. + TString encoded = Base64Encode(serial.substr(0, size)); + + // Output bad serial number in base64 encoding. + STLOG(PRI_WARN, BS_NODE, NW03, "Bad serial number", (Path, path), (SerialBase64, encoded.Quote()), (Details, details.Str())); + + mapIt->second.BadSerialsRead->Inc(); + } + + // Remove drives with invalid serials. + drives.erase(toRemove, drives.end()); +} + TVector<NPDisk::TDriveData> TNodeWarden::ListLocalDrives() { - TVector<NPDisk::TDriveData> drives = ListDevicesWithPartlabel(); + TStringStream details; + TVector<NPDisk::TDriveData> drives = ListDevicesWithPartlabel(details); try { TString raw = TFileInput(MockDevicesPath).ReadAll(); @@ -29,6 +109,8 @@ TVector<NPDisk::TDriveData> TNodeWarden::ListLocalDrives() { return lhs.Path < rhs.Path; }); + RemoveDrivesWithBadSerialsAndReport(drives, details); + return drives; } diff --git a/ydb/core/blobstorage/nodewarden/node_warden_impl.h b/ydb/core/blobstorage/nodewarden/node_warden_impl.h index ec56a334f47..dbe59b5d3c4 100644 --- a/ydb/core/blobstorage/nodewarden/node_warden_impl.h +++ b/ydb/core/blobstorage/nodewarden/node_warden_impl.h @@ -63,12 +63,25 @@ namespace NKikimr::NStorage { {} }; + struct TDrivePathCounters { + ::NMonitoring::TDynamicCounters::TCounterPtr BadSerialsRead; + + TDrivePathCounters(const TIntrusivePtr<::NMonitoring::TDynamicCounters>& counters, const TString& path) { + auto driveGroup = GetServiceCounters(counters, "pdisks")->GetSubgroup("path", path); + + BadSerialsRead = driveGroup->GetCounter("BadSerialsRead"); + } + }; + class TNodeWarden : public TActorBootstrapped<TNodeWarden> { TIntrusivePtr<TNodeWardenConfig> Cfg; TIntrusivePtr<TDsProxyNodeMon> DsProxyNodeMon; TActorId DsProxyNodeMonActor; TIntrusivePtr<TDsProxyPerPoolCounters> DsProxyPerPoolCounters; + // Counters for drives by drive path. + TMap<TString, TDrivePathCounters> ByPathDriveCounters; + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// ui32 LocalNodeId; // NodeId for local node @@ -158,8 +171,28 @@ namespace NKikimr::NStorage { void StartVirtualGroupAgent(ui32 groupId); void StartStaticProxies(); + /** + * Removes drives with bad serial numbers and reports them to monitoring. + * + * This method scans a vector of drive data and checks for drives with bad serial numbers. + * Drives with bad serial numbers are removed from the vector and reported to the monitoring. + * + * @param drives A vector of data representing disk drives. + * @param details A string stream with details about drives. + */ + void RemoveDrivesWithBadSerialsAndReport(TVector<NPDisk::TDriveData>& drives, TStringStream& details); TVector<NPDisk::TDriveData> ListLocalDrives(); + TVector<TString> DrivePathCounterKeys() const { + TVector<TString> keys; + + for (const auto& [key, _] : ByPathDriveCounters) { + keys.push_back(key); + } + + return keys; + } + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Pipe management diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp index 971947d374b..060d87243c3 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp @@ -453,7 +453,8 @@ void TestPayloadOffset(ui64 firstSector, ui64 lastSector, ui64 currentSector, ui } Y_UNIT_TEST(TestDeviceList) { - for(const NPDisk::TDriveData& data : ListDevicesWithPartlabel()) { + TStringStream details; + for(const NPDisk::TDriveData& data : ListDevicesWithPartlabel(details)) { Cout << "data# " << data.ToString(false) << Endl; } } diff --git a/ydb/library/pdisk_io/file_params.h b/ydb/library/pdisk_io/file_params.h index 46dab2109ab..0199d341ab3 100644 --- a/ydb/library/pdisk_io/file_params.h +++ b/ydb/library/pdisk_io/file_params.h @@ -11,7 +11,7 @@ void DetectFileParameters(TString path, ui64 &outDiskSizeBytes, bool &outIsBlock std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& serial, bool partlabelOnly); -TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(); +TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& details); TVector<NPDisk::TDriveData> ListAllDevices(); diff --git a/ydb/library/pdisk_io/file_params_darwin.cpp b/ydb/library/pdisk_io/file_params_darwin.cpp index d300a153452..a33465da0da 100644 --- a/ydb/library/pdisk_io/file_params_darwin.cpp +++ b/ydb/library/pdisk_io/file_params_darwin.cpp @@ -56,7 +56,7 @@ std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& /*seri return {}; } -TVector<NPDisk::TDriveData> ListDevicesWithPartlabel() { +TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& /*details*/) { return {}; } diff --git a/ydb/library/pdisk_io/file_params_linux.cpp b/ydb/library/pdisk_io/file_params_linux.cpp index 98c2ee62e5d..fd02a7a836e 100644 --- a/ydb/library/pdisk_io/file_params_linux.cpp +++ b/ydb/library/pdisk_io/file_params_linux.cpp @@ -83,7 +83,7 @@ static TVector<NPDisk::TDriveData> FilterOnlyUniqueSerial(TVector<NPDisk::TDrive return result; } -static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString& serial, std::regex device_regex) { +static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString& serial, std::regex device_regex, TStringStream& details) { TFsPath path(folder); TVector<TFsPath> children; try { @@ -94,17 +94,22 @@ static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString TVector<NPDisk::TDriveData> devicesFound; for (const auto& child : children) { if (std::regex_match(child.GetName().c_str(), device_regex)) { - TStringStream details; std::optional<NPDisk::TDriveData> data = NPDisk::GetDriveData(child.GetPath(), &details); if (data && (!serial || data->SerialNumber == serial)) { devicesFound.push_back(*data); } + details << "#"; } } return FilterOnlyUniqueSerial(devicesFound); } +static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString& serial, std::regex device_regex) { + TStringStream details; + return ListDevices(folder, serial, device_regex, details); +} + static std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const char *folder, const TString& serial, std::regex device_regex) { TVector<NPDisk::TDriveData> devicesFound = ListDevices(folder, serial, device_regex); @@ -125,8 +130,8 @@ TVector<NPDisk::TDriveData> ListAllDevices() { return ListDevices("/dev", {}, std::regex(".*")); } -TVector<NPDisk::TDriveData> ListDevicesWithPartlabel() { - return ListDevices("/dev/disk/by-partlabel", "", kikimrDevice); +TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& details) { + return ListDevices("/dev/disk/by-partlabel", "", kikimrDevice, details); } std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& serial, bool partlabelOnly) { diff --git a/ydb/library/pdisk_io/file_params_win.cpp b/ydb/library/pdisk_io/file_params_win.cpp index 9da2d2bcb8c..e344a8c93b1 100644 --- a/ydb/library/pdisk_io/file_params_win.cpp +++ b/ydb/library/pdisk_io/file_params_win.cpp @@ -38,7 +38,7 @@ std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& /*seri return {}; } -TVector<NPDisk::TDriveData> ListDevicesWithPartlabel() { +TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& /*details*/) { return {}; } |