aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsenya0x5f <senya0x5f@yandex-team.com>2023-08-22 19:53:37 +0300
committersenya0x5f <senya0x5f@yandex-team.com>2023-08-22 20:23:16 +0300
commitb4bdd3643d5a44e1440cad0d7c724b3b3de350c4 (patch)
treeb4fb23990083a5118622b9f8708e6a72d586c06f
parent24dd18e2743431eef40c048bb9036afb2ca11d78 (diff)
downloadydb-b4bdd3643d5a44e1440cad0d7c724b3b3de350c4.tar.gz
KIKIMR-17302 Ignore bad serials, log them, monitor them
-rw-r--r--ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp123
-rw-r--r--ydb/core/blobstorage/nodewarden/node_warden_impl.cpp84
-rw-r--r--ydb/core/blobstorage/nodewarden/node_warden_impl.h33
-rw-r--r--ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp3
-rw-r--r--ydb/library/pdisk_io/file_params.h2
-rw-r--r--ydb/library/pdisk_io/file_params_darwin.cpp2
-rw-r--r--ydb/library/pdisk_io/file_params_linux.cpp13
-rw-r--r--ydb/library/pdisk_io/file_params_win.cpp2
8 files changed, 253 insertions, 9 deletions
diff --git a/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp b/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp
index e8aaee7dd39..8e71f77d277 100644
--- a/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp
+++ b/ydb/core/blobstorage/nodewarden/blobstorage_node_warden_ut.cpp
@@ -11,11 +11,13 @@
#include <ydb/core/base/statestorage_impl.h>
#include <ydb/core/blobstorage/crypto/default.h>
#include <ydb/core/blobstorage/nodewarden/node_warden.h>
+#include <ydb/core/blobstorage/nodewarden/node_warden_impl.h>
#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/blobstorage/pdisk/blobstorage_pdisk_tools.h>
#include <ydb/core/blobstorage/pdisk/blobstorage_pdisk_ut_http_request.h>
#include <ydb/core/mind/bscontroller/bsc.h>
#include <ydb/core/mind/local.h>
+#include <ydb/core/util/testactorsys.h>
#include <ydb/library/pdisk_io/sector_map.h>
#include <util/random/entropy.h>
@@ -483,6 +485,127 @@ Y_UNIT_TEST_SUITE(TBlobStorageWardenTest) {
BlockGroup(runtime, sender0, tabletId, groupId, generation++, true, NKikimrProto::EReplyStatus::NO_GROUP);
}
+ CUSTOM_UNIT_TEST(TestFilterBadSerials) {
+ TTestActorSystem runtime(1);
+ runtime.Start();
+
+ TIntrusivePtr<TNodeWardenConfig> nodeWardenConfig(new TNodeWardenConfig(static_cast<IPDiskServiceFactory*>(new TRealPDiskServiceFactory())));
+
+ IActor* ac = CreateBSNodeWarden(nodeWardenConfig.Release());
+
+ TActorId nodeWarden = runtime.Register(ac, 1);
+
+ runtime.WrapInActorContext(nodeWarden, [](IActor* wardenActor) {
+ auto vectorsEqual = [](const TVector<TString>& vec1, const TVector<TString>& vec2) {
+ TVector<TString> sortedVec1 = vec1;
+ TVector<TString> sortedVec2 = vec2;
+
+ std::sort(sortedVec1.begin(), sortedVec1.end());
+ std::sort(sortedVec2.begin(), sortedVec2.end());
+
+ return sortedVec1 == sortedVec2;
+ };
+
+ auto checkHasOnlyGoodDrive = [](TVector<NPDisk::TDriveData>& drives) {
+ UNIT_ASSERT_EQUAL(1, drives.size());
+ UNIT_ASSERT_EQUAL(drives[0].Path, "/good1");
+ };
+
+ NStorage::TNodeWarden& warden = *dynamic_cast<NStorage::TNodeWarden*>(wardenActor);
+
+ NPDisk::TDriveData goodDrive1;
+ goodDrive1.Path = "/good1";
+ goodDrive1.SerialNumber = "FOOBAR";
+
+ NPDisk::TDriveData goodDrive2;
+ goodDrive2.Path = "/good2";
+ goodDrive2.SerialNumber = "BARFOO";
+
+ NPDisk::TDriveData badDrive1;
+ badDrive1.Path = "/bad1";
+ char s[] = {50, 51, 52, -128}; // Non-ASCII character -128.
+ badDrive1.SerialNumber = TString(s);
+
+ NPDisk::TDriveData badDrive2;
+ badDrive2.Path = "/bad2";
+ badDrive2.SerialNumber = "NOT\tGOOD"; // Non-printable character \t.
+
+ NPDisk::TDriveData badDrive3;
+ badDrive3.Path = "/bad3";
+ badDrive3.SerialNumber = TString(101, 'F'); // Size exceeds 100.
+
+ NPDisk::TDriveData badDrive4;
+ badDrive4.Path = "/bad4";
+ badDrive4.SerialNumber = "NOTGOODEITHER";
+ badDrive4.SerialNumber[5] = '\0'; // Unexpected null-terminator.
+
+ TStringStream details;
+
+ // Check for zero drives.
+ {
+ TVector<NPDisk::TDriveData> drives;
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ UNIT_ASSERT_EQUAL(0, drives.size());
+ UNIT_ASSERT_EQUAL(0, warden.DrivePathCounterKeys().size());
+ }
+
+ // If a drive is not present in a subsequent call, then it is removed from a counters map.
+ // We check both serial number validator and also that counters are removed for missing drives.
+ {
+ TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive1};
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ checkHasOnlyGoodDrive(drives);
+ UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad1"}));
+ }
+ {
+ TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive2};
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ checkHasOnlyGoodDrive(drives);
+ UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad2"}));
+ }
+ {
+ TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive3};
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ checkHasOnlyGoodDrive(drives);
+ UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad3"}));
+ }
+ {
+ TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive4};
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ checkHasOnlyGoodDrive(drives);
+ UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad4"}));
+ }
+ {
+ TVector<NPDisk::TDriveData> drives = {goodDrive1, goodDrive2, badDrive4};
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ UNIT_ASSERT_EQUAL(2, drives.size());
+ UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/good2", "/bad4"}));
+ }
+ // Check that good drives can also be removed from counters map.
+ {
+ TVector<NPDisk::TDriveData> drives = {goodDrive1, badDrive4};
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ checkHasOnlyGoodDrive(drives);
+ UNIT_ASSERT(vectorsEqual(warden.DrivePathCounterKeys(), {"/good1", "/bad4"}));
+ }
+ // Check that everything is removed if there are no drives.
+ {
+ TVector<NPDisk::TDriveData> drives;
+ warden.RemoveDrivesWithBadSerialsAndReport(drives, details);
+
+ UNIT_ASSERT_EQUAL(0, drives.size());
+ UNIT_ASSERT_EQUAL(0, warden.DrivePathCounterKeys().size());
+ }
+ });
+ }
+
CUSTOM_UNIT_TEST(TestSendToInvalidGroupId) {
TTestBasicRuntime runtime(1, false);
Setup(runtime, "", nullptr);
diff --git a/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp b/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp
index eb0edf6a5cd..1c3f483f9de 100644
--- a/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp
+++ b/ydb/core/blobstorage/nodewarden/node_warden_impl.cpp
@@ -7,8 +7,88 @@
using namespace NKikimr;
using namespace NStorage;
+void TNodeWarden::RemoveDrivesWithBadSerialsAndReport(TVector<NPDisk::TDriveData>& drives, TStringStream& details) {
+ // Serial number's size definitely won't exceed this number of bytes.
+ size_t maxSerialSizeInBytes = 100;
+
+ auto isValidSerial = [maxSerialSizeInBytes](TString& serial) {
+ if (serial.Size() > maxSerialSizeInBytes) {
+ // Not sensible size.
+ return false;
+ }
+
+ // Check if serial number contains only ASCII characters.
+ for (size_t i = 0; i < serial.Size(); ++i) {
+ i8 c = serial[i];
+
+ if (c <= 0) {
+ // Encountered null terminator earlier than expected or non-ASCII character.
+ return false;
+ }
+
+ if (!isprint(c)) {
+ // Encountered non-printable character.
+ return false;
+ }
+ }
+
+ return true;
+ };
+
+ std::unordered_set<TString> drivePaths;
+
+ for (const auto& drive : drives) {
+ drivePaths.insert(drive.Path);
+ }
+
+ // Remove counters for drives that are no longer present.
+ for (auto countersIt = ByPathDriveCounters.begin(); countersIt != ByPathDriveCounters.end();) {
+ if (drivePaths.find(countersIt->first) == drivePaths.end()) {
+ countersIt = ByPathDriveCounters.erase(countersIt);
+ } else {
+ countersIt++;
+ }
+ }
+
+ // Prepare removal of drives with invalid serials.
+ auto toRemove = std::remove_if(drives.begin(), drives.end(), [&isValidSerial](auto& driveData) {
+ TString& serial = driveData.SerialNumber;
+
+ return !isValidSerial(serial);
+ });
+
+ // Add counters for every drive path.
+ for (auto it = drives.begin(); it != toRemove; ++it) {
+ TString& path = it->Path;
+ ByPathDriveCounters.try_emplace(path, AppData()->Counters, path);
+ }
+
+ // And for drives with invalid serials log serial and report to the monitoring.
+ for (auto it = toRemove; it != drives.end(); ++it) {
+ TString& serial = it->SerialNumber;
+ TString& path = it->Path;
+
+ auto [mapIt, _] = ByPathDriveCounters.try_emplace(path, AppData()->Counters, path);
+
+ // Cut string in case it exceeds max size.
+ size_t size = std::min(serial.Size(), maxSerialSizeInBytes);
+
+ // Encode in case it contains weird symbols.
+ TString encoded = Base64Encode(serial.substr(0, size));
+
+ // Output bad serial number in base64 encoding.
+ STLOG(PRI_WARN, BS_NODE, NW03, "Bad serial number", (Path, path), (SerialBase64, encoded.Quote()), (Details, details.Str()));
+
+ mapIt->second.BadSerialsRead->Inc();
+ }
+
+ // Remove drives with invalid serials.
+ drives.erase(toRemove, drives.end());
+}
+
TVector<NPDisk::TDriveData> TNodeWarden::ListLocalDrives() {
- TVector<NPDisk::TDriveData> drives = ListDevicesWithPartlabel();
+ TStringStream details;
+ TVector<NPDisk::TDriveData> drives = ListDevicesWithPartlabel(details);
try {
TString raw = TFileInput(MockDevicesPath).ReadAll();
@@ -29,6 +109,8 @@ TVector<NPDisk::TDriveData> TNodeWarden::ListLocalDrives() {
return lhs.Path < rhs.Path;
});
+ RemoveDrivesWithBadSerialsAndReport(drives, details);
+
return drives;
}
diff --git a/ydb/core/blobstorage/nodewarden/node_warden_impl.h b/ydb/core/blobstorage/nodewarden/node_warden_impl.h
index ec56a334f47..dbe59b5d3c4 100644
--- a/ydb/core/blobstorage/nodewarden/node_warden_impl.h
+++ b/ydb/core/blobstorage/nodewarden/node_warden_impl.h
@@ -63,12 +63,25 @@ namespace NKikimr::NStorage {
{}
};
+ struct TDrivePathCounters {
+ ::NMonitoring::TDynamicCounters::TCounterPtr BadSerialsRead;
+
+ TDrivePathCounters(const TIntrusivePtr<::NMonitoring::TDynamicCounters>& counters, const TString& path) {
+ auto driveGroup = GetServiceCounters(counters, "pdisks")->GetSubgroup("path", path);
+
+ BadSerialsRead = driveGroup->GetCounter("BadSerialsRead");
+ }
+ };
+
class TNodeWarden : public TActorBootstrapped<TNodeWarden> {
TIntrusivePtr<TNodeWardenConfig> Cfg;
TIntrusivePtr<TDsProxyNodeMon> DsProxyNodeMon;
TActorId DsProxyNodeMonActor;
TIntrusivePtr<TDsProxyPerPoolCounters> DsProxyPerPoolCounters;
+ // Counters for drives by drive path.
+ TMap<TString, TDrivePathCounters> ByPathDriveCounters;
+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ui32 LocalNodeId; // NodeId for local node
@@ -158,8 +171,28 @@ namespace NKikimr::NStorage {
void StartVirtualGroupAgent(ui32 groupId);
void StartStaticProxies();
+ /**
+ * Removes drives with bad serial numbers and reports them to monitoring.
+ *
+ * This method scans a vector of drive data and checks for drives with bad serial numbers.
+ * Drives with bad serial numbers are removed from the vector and reported to the monitoring.
+ *
+ * @param drives A vector of data representing disk drives.
+ * @param details A string stream with details about drives.
+ */
+ void RemoveDrivesWithBadSerialsAndReport(TVector<NPDisk::TDriveData>& drives, TStringStream& details);
TVector<NPDisk::TDriveData> ListLocalDrives();
+ TVector<TString> DrivePathCounterKeys() const {
+ TVector<TString> keys;
+
+ for (const auto& [key, _] : ByPathDriveCounters) {
+ keys.push_back(key);
+ }
+
+ return keys;
+ }
+
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Pipe management
diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp
index 971947d374b..060d87243c3 100644
--- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp
+++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_util_ut.cpp
@@ -453,7 +453,8 @@ void TestPayloadOffset(ui64 firstSector, ui64 lastSector, ui64 currentSector, ui
}
Y_UNIT_TEST(TestDeviceList) {
- for(const NPDisk::TDriveData& data : ListDevicesWithPartlabel()) {
+ TStringStream details;
+ for(const NPDisk::TDriveData& data : ListDevicesWithPartlabel(details)) {
Cout << "data# " << data.ToString(false) << Endl;
}
}
diff --git a/ydb/library/pdisk_io/file_params.h b/ydb/library/pdisk_io/file_params.h
index 46dab2109ab..0199d341ab3 100644
--- a/ydb/library/pdisk_io/file_params.h
+++ b/ydb/library/pdisk_io/file_params.h
@@ -11,7 +11,7 @@ void DetectFileParameters(TString path, ui64 &outDiskSizeBytes, bool &outIsBlock
std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& serial, bool partlabelOnly);
-TVector<NPDisk::TDriveData> ListDevicesWithPartlabel();
+TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& details);
TVector<NPDisk::TDriveData> ListAllDevices();
diff --git a/ydb/library/pdisk_io/file_params_darwin.cpp b/ydb/library/pdisk_io/file_params_darwin.cpp
index d300a153452..a33465da0da 100644
--- a/ydb/library/pdisk_io/file_params_darwin.cpp
+++ b/ydb/library/pdisk_io/file_params_darwin.cpp
@@ -56,7 +56,7 @@ std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& /*seri
return {};
}
-TVector<NPDisk::TDriveData> ListDevicesWithPartlabel() {
+TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& /*details*/) {
return {};
}
diff --git a/ydb/library/pdisk_io/file_params_linux.cpp b/ydb/library/pdisk_io/file_params_linux.cpp
index 98c2ee62e5d..fd02a7a836e 100644
--- a/ydb/library/pdisk_io/file_params_linux.cpp
+++ b/ydb/library/pdisk_io/file_params_linux.cpp
@@ -83,7 +83,7 @@ static TVector<NPDisk::TDriveData> FilterOnlyUniqueSerial(TVector<NPDisk::TDrive
return result;
}
-static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString& serial, std::regex device_regex) {
+static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString& serial, std::regex device_regex, TStringStream& details) {
TFsPath path(folder);
TVector<TFsPath> children;
try {
@@ -94,17 +94,22 @@ static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString
TVector<NPDisk::TDriveData> devicesFound;
for (const auto& child : children) {
if (std::regex_match(child.GetName().c_str(), device_regex)) {
- TStringStream details;
std::optional<NPDisk::TDriveData> data = NPDisk::GetDriveData(child.GetPath(), &details);
if (data && (!serial || data->SerialNumber == serial)) {
devicesFound.push_back(*data);
}
+ details << "#";
}
}
return FilterOnlyUniqueSerial(devicesFound);
}
+static TVector<NPDisk::TDriveData> ListDevices(const char *folder, const TString& serial, std::regex device_regex) {
+ TStringStream details;
+ return ListDevices(folder, serial, device_regex, details);
+}
+
static std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const char *folder, const TString& serial,
std::regex device_regex) {
TVector<NPDisk::TDriveData> devicesFound = ListDevices(folder, serial, device_regex);
@@ -125,8 +130,8 @@ TVector<NPDisk::TDriveData> ListAllDevices() {
return ListDevices("/dev", {}, std::regex(".*"));
}
-TVector<NPDisk::TDriveData> ListDevicesWithPartlabel() {
- return ListDevices("/dev/disk/by-partlabel", "", kikimrDevice);
+TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& details) {
+ return ListDevices("/dev/disk/by-partlabel", "", kikimrDevice, details);
}
std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& serial, bool partlabelOnly) {
diff --git a/ydb/library/pdisk_io/file_params_win.cpp b/ydb/library/pdisk_io/file_params_win.cpp
index 9da2d2bcb8c..e344a8c93b1 100644
--- a/ydb/library/pdisk_io/file_params_win.cpp
+++ b/ydb/library/pdisk_io/file_params_win.cpp
@@ -38,7 +38,7 @@ std::optional<NPDisk::TDriveData> FindDeviceBySerialNumber(const TString& /*seri
return {};
}
-TVector<NPDisk::TDriveData> ListDevicesWithPartlabel() {
+TVector<NPDisk::TDriveData> ListDevicesWithPartlabel(TStringStream& /*details*/) {
return {};
}