author | alexvru <alexvru@ydb.tech> | 2022-08-04 20:30:05 +0300
---|---|---
committer | alexvru <alexvru@ydb.tech> | 2022-08-04 20:30:05 +0300
commit | 80d3ddafcb902ea26c7588431dcb62923ac86c2e (patch) |
tree | 0880e8f04335934259c65450ec5de9c02cc4509a |
parent | 9b0d93190b58ed55a168e9636f71d4518247c124 (diff) |
download | ydb-80d3ddafcb902ea26c7588431dcb62923ac86c2e.tar.gz |
Refactor the way the virtual group BlobDepot tablet is created -- use Hive directly instead of Schemeshard
24 files changed, 340 insertions, 443 deletions
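For orientation before the diff: a minimal sketch of how a client would fill in the two reworked BSC commands after this change. Field and accessor names (Name, HiveId, StoragePoolName, ChannelProfiles, GroupIds) are taken from the updated blobstorage_config.proto, blob_depot_config.proto, and the test below; the tablet id, group id, and pool name are placeholders, and the presence of MutableDecommitGroups() in TConfigRequest's command union is assumed from the TDecommitGroups handler added in this commit — this is not code from the commit itself.

```cpp
#include <ydb/core/protos/blobstorage_config.pb.h>

// Sketch: build a BSC config request that (1) allocates a virtual group whose
// BlobDepot tablet is created via Hive and (2) starts decommission of an
// existing physical group. All literal values are placeholders.
NKikimrBlobStorage::TConfigRequest BuildExampleRequest() {
    NKikimrBlobStorage::TConfigRequest request;

    // Allocate a virtual group: the command now carries a unique Name
    // (idempotency key) and a HiveId instead of the removed
    // VirtualGroupPool/ParentDir fields.
    auto *alloc = request.AddCommand()->MutableAllocateVirtualGroup();
    alloc->SetName("vg");                        // unique virtual group name
    alloc->SetHiveId(72057594037968897ull);      // Hive tablet id (placeholder)
    alloc->SetStoragePoolName("/Root:pool");     // placeholder pool name
    auto *profile = alloc->AddChannelProfiles();
    profile->SetStoragePoolName("/Root:pool");   // used on the BSC -> Hive path
    profile->SetCount(2);

    // Decommit a physical group: one BlobDepot per group (DecommitGroupId in
    // its config), created under the given Hive with the given channel profiles.
    auto *decommit = request.AddCommand()->MutableDecommitGroups();
    decommit->AddGroupIds(2181038080);           // group id to decommit (placeholder)
    decommit->SetHiveId(72057594037968897ull);
    decommit->MutableChannelProfiles()->CopyFrom(alloc->GetChannelProfiles());

    return request;
}
```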
diff --git a/ydb/core/blob_depot/agent.cpp b/ydb/core/blob_depot/agent.cpp index cd8137f0af8..cc731030d48 100644 --- a/ydb/core/blob_depot/agent.cpp +++ b/ydb/core/blob_depot/agent.cpp @@ -70,7 +70,9 @@ namespace NKikimr::NBlobDepot { } } - record->MutableDecommittingGroups()->CopyFrom(Config.GetDecommittingGroups()); + if (Config.HasDecommitGroupId()) { + record->SetDecommitGroupId(Config.GetDecommitGroupId()); + } TActivationContext::Send(response.release()); } diff --git a/ydb/core/blob_depot/assimilator.cpp b/ydb/core/blob_depot/assimilator.cpp index 5b2c9790e9b..c871995c407 100644 --- a/ydb/core/blob_depot/assimilator.cpp +++ b/ydb/core/blob_depot/assimilator.cpp @@ -10,13 +10,15 @@ namespace NKikimr::NBlobDepot { const ui32 GroupId; const NKikimrBlobDepot::TBlobDepotConfig Config; + const ui64 TabletId; TActorId BlobDepotId; TIntrusivePtr<TBlobStorageGroupInfo> Info; public: - TGroupAssimilator(ui32 groupId, const NKikimrBlobDepot::TBlobDepotConfig& config) - : GroupId(groupId) + TGroupAssimilator(const NKikimrBlobDepot::TBlobDepotConfig& config, ui64 tabletId) + : GroupId(config.GetDecommitGroupId()) , Config(config) + , TabletId(tabletId) { Y_VERIFY(Config.GetOperationMode() == NKikimrBlobDepot::EOperationMode::VirtualGroup); } @@ -85,13 +87,8 @@ namespace NKikimr::NBlobDepot { if (group.GetGroupID() == GroupId) { if (group.GetEntityStatus() == NKikimrBlobStorage::EEntityStatus::DESTROY) { return AbortWithError("the group being decommitted was destroyed"); - } else if (!group.HasAssimilatorGroupId()) { - return AbortWithError("the group being decommitted is not in assimilation mode"); - } else if (group.HasBlobDepotId()) { - const TString msg = "the group being decommitted is a virtual one"; - Y_VERIFY_DEBUG(false, "%s", msg.data()); - STLOG(PRI_CRIT, BLOB_DEPOT, BDT35, msg, (GroupId, GroupId)); - return AbortWithError(msg); + } else if (!group.HasBlobDepotId() || group.GetBlobDepotId() != TabletId) { + return AbortWithError("inconsistent decommission state"); } else { Info = TBlobStorageGroupInfo::Parse(group, nullptr, nullptr); StartAssimilation(); @@ -136,35 +133,20 @@ namespace NKikimr::NBlobDepot { void AbortWithError(TString error) { STLOG(PRI_ERROR, BLOB_DEPOT, BDT34, "failed to assimilate group", (GroupId, GroupId), (Error, error)); - TActivationContext::Send(new IEventHandle(TEvents::TSystem::Gone, 0, BlobDepotId, SelfId(), nullptr, GroupId)); + TActivationContext::Send(new IEventHandle(TEvents::TSystem::Gone, 0, BlobDepotId, SelfId(), nullptr, 0)); PassAway(); } }; - void TBlobDepot::StartGroupAssimilators() { - for (const ui32 groupId : Config.GetDecommittingGroups()) { - if (Config.GetOperationMode() != NKikimrBlobDepot::EOperationMode::VirtualGroup) { - STLOG(PRI_CRIT, BLOB_DEPOT, BDT36, "incorrect operating mode of BlobDepot", (TabletId, TabletID())); - Y_VERIFY_DEBUG(false, "incorrect operating mode of BlobDepot"); - return; - } - - StartGroupAssimilator(groupId); - } - } - - void TBlobDepot::StartGroupAssimilator(ui32 groupId) { - if (RunningGroupAssimilators.contains(groupId)) { - return; + void TBlobDepot::StartGroupAssimilator() { + if (!RunningGroupAssimilator && Config.HasDecommitGroupId()) { + RunningGroupAssimilator = Register(new TGroupAssimilator(Config, TabletID())); } - - RunningGroupAssimilators[groupId] = Register(new TGroupAssimilator(groupId, Config)); } void TBlobDepot::HandleGone(TAutoPtr<IEventHandle> ev) { - if (const auto it = RunningGroupAssimilators.find(ev->Cookie); it != RunningGroupAssimilators.end()) { - Y_VERIFY(it->second == ev->Sender); 
- RunningGroupAssimilators.erase(it); + if (ev->Sender == RunningGroupAssimilator) { + RunningGroupAssimilator = {}; } else { Y_FAIL("unexpected event"); } diff --git a/ydb/core/blob_depot/blob_depot.cpp b/ydb/core/blob_depot/blob_depot.cpp index a93015f6522..6cb1a811993 100644 --- a/ydb/core/blob_depot/blob_depot.cpp +++ b/ydb/core/blob_depot/blob_depot.cpp @@ -55,8 +55,9 @@ namespace NKikimr::NBlobDepot { } void TBlobDepot::PassAway() { - for (const auto& [_, actorId] : RunningGroupAssimilators) { - TActivationContext::Send(new IEventHandle(TEvents::TSystem::Poison, 0, actorId, SelfId(), nullptr, 0)); + if (RunningGroupAssimilator) { + TActivationContext::Send(new IEventHandle(TEvents::TSystem::Poison, 0, RunningGroupAssimilator, SelfId(), + nullptr, 0)); } TActor::PassAway(); diff --git a/ydb/core/blob_depot/blob_depot_tablet.h b/ydb/core/blob_depot/blob_depot_tablet.h index 55bd284f0cd..10586eb63b8 100644 --- a/ydb/core/blob_depot/blob_depot_tablet.h +++ b/ydb/core/blob_depot/blob_depot_tablet.h @@ -219,7 +219,7 @@ namespace NKikimr::NBlobDepot { void StartOperation() { InitChannelKinds(); - StartGroupAssimilators(); + StartGroupAssimilator(); } void OnDetach(const TActorContext&) override { @@ -308,13 +308,12 @@ namespace NKikimr::NBlobDepot { //////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Group assimilation - THashMap<ui32, TActorId> RunningGroupAssimilators; + TActorId RunningGroupAssimilator; class TGroupAssimilator; class TGroupAssimilatorFetchMachine; - void StartGroupAssimilators(); - void StartGroupAssimilator(ui32 groupId); + void StartGroupAssimilator(); void HandleGone(TAutoPtr<IEventHandle> ev); void Handle(TEvAssimilatedData::TPtr ev); }; diff --git a/ydb/core/blobstorage/dsproxy/dsproxy_impl.h b/ydb/core/blobstorage/dsproxy/dsproxy_impl.h index 6a8e8940f0c..fd4b5e5b5e4 100644 --- a/ydb/core/blobstorage/dsproxy/dsproxy_impl.h +++ b/ydb/core/blobstorage/dsproxy/dsproxy_impl.h @@ -260,8 +260,8 @@ class TBlobStorageGroupProxy : public TActorBootstrapped<TBlobStorageGroupProxy> template<typename TEvent> void HandleCheckAssimilator(TAutoPtr<TEventHandle<TEvent>>& ev) { - if (Info->AssimilatorGroupId) { - TActivationContext::Send(ev->Forward(MakeBlobStorageProxyID(*Info->AssimilatorGroupId))); + if (/*Info->AssimilatorGroupId*/ false) { + //TActivationContext::Send(ev->Forward(MakeBlobStorageProxyID(*Info->AssimilatorGroupId))); } else { return HandleNormal(ev); } diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp index 409d79173da..928dbfd770d 100644 --- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp +++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp @@ -660,8 +660,8 @@ TIntrusivePtr<TBlobStorageGroupInfo> TBlobStorageGroupInfo::Parse(const NKikimrB if (group.HasBlobDepotId()) { res->BlobDepotId = group.GetBlobDepotId(); } - if (group.HasAssimilatorGroupId()) { - res->AssimilatorGroupId = group.GetAssimilatorGroupId(); + if (group.HasDecommitStatus()) { + res->DecommitStatus = group.GetDecommitStatus(); } // process encryption parameters diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h index 1852dc8391a..186731fb23d 100644 --- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h +++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h @@ -416,7 +416,7 @@ public: // virtual group BlobDepot tablet id std::optional<ui64> 
BlobDepotId; // assimilating group id - std::optional<ui32> AssimilatorGroupId; + NKikimrBlobStorage::TGroupDecommitStatus::E DecommitStatus = NKikimrBlobStorage::TGroupDecommitStatus::NONE; // origin of the group info content std::optional<NKikimrBlobStorage::TGroupInfo> Group; diff --git a/ydb/core/blobstorage/ut_blobstorage/blob_depot.cpp b/ydb/core/blobstorage/ut_blobstorage/blob_depot.cpp index 0c1cd18d596..a207f76964b 100644 --- a/ydb/core/blobstorage/ut_blobstorage/blob_depot.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/blob_depot.cpp @@ -18,9 +18,9 @@ Y_UNIT_TEST_SUITE(BlobDepot) { NKikimrBlobStorage::TConfigRequest request; auto *cmd = request.AddCommand()->MutableAllocateVirtualGroup(); - cmd->SetVirtualGroupPool("vg"); + cmd->SetName("vg"); + cmd->SetHiveId(1); cmd->SetStoragePoolName(env.StoragePoolName); - cmd->SetParentDir("/Root"); cmd->SetBlobDepotId(env.Settings.BlobDepotId); auto *prof = cmd->AddChannelProfiles(); prof->SetStoragePoolKind(""); diff --git a/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeleton.cpp b/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeleton.cpp index e41fb7831cc..480ba96c1a2 100644 --- a/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeleton.cpp +++ b/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeleton.cpp @@ -97,12 +97,24 @@ namespace NKikimr { // Some stuff to handle a case when we can't accept TEvVPut requests // because of (fresh) compaction overload //////////////////////////////////////////////////////////////////////// + static bool BlockWrites(NKikimrBlobStorage::TGroupDecommitStatus::E status) { + switch (status) { + case NKikimrBlobStorage::TGroupDecommitStatus::NONE: + case NKikimrBlobStorage::TGroupDecommitStatus::PENDING: + return false; + + case NKikimrBlobStorage::TGroupDecommitStatus::IN_PROGRESS: + case NKikimrBlobStorage::TGroupDecommitStatus::DONE: + return true; + } + } + template<typename TEvent> bool CheckIfWriteAllowed(TAutoPtr<TEventHandle<TEvent>>& ev, const TActorContext& ctx) { if (Config->BaseInfo.DonorMode) { ReplyError(NKikimrProto::ERROR, "disk is in donor mode", ev, ctx, TAppData::TimeProvider->Now()); - } else if (GInfo->AssimilatorGroupId) { - ReplyError(NKikimrProto::ERROR, "group is being assimilated", ev, ctx, TAppData::TimeProvider->Now()); + } else if (BlockWrites(GInfo->DecommitStatus)) { + ReplyError(NKikimrProto::ERROR, "group is being decommitted", ev, ctx, TAppData::TimeProvider->Now()); } else { return true; } diff --git a/ydb/core/mind/bscontroller/CMakeLists.txt b/ydb/core/mind/bscontroller/CMakeLists.txt index d55dd480e20..6fc40a4d23d 100644 --- a/ydb/core/mind/bscontroller/CMakeLists.txt +++ b/ydb/core/mind/bscontroller/CMakeLists.txt @@ -17,6 +17,7 @@ target_link_libraries(core-mind-bscontroller PUBLIC ydb-core-blobstorage core-blobstorage-base core-blobstorage-groupinfo + ydb-core-blob_depot core-engine-minikql ydb-core-protos core-sys_view-common diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp index fb8cdaa58a9..02c57a38f8a 100644 --- a/ydb/core/mind/bscontroller/bsc.cpp +++ b/ydb/core/mind/bscontroller/bsc.cpp @@ -57,42 +57,42 @@ TBlobStorageController::TVSlotInfo::TVSlotInfo(TVSlotId vSlotId, TPDiskInfo *pdi } void TBlobStorageController::TGroupInfo::CalculateGroupStatus() { + Status = {}; + if (VirtualGroupState) { if (VirtualGroupState == NKikimrBlobStorage::EVirtualGroupState::WORKING) { - Status = {NKikimrBlobStorage::TGroupStatus::FULL, NKikimrBlobStorage::TGroupStatus::FULL}; + Status.MakeWorst(NKikimrBlobStorage::TGroupStatus::FULL, 
NKikimrBlobStorage::TGroupStatus::FULL); } else { - Status = {NKikimrBlobStorage::TGroupStatus::DISINTEGRATED, NKikimrBlobStorage::TGroupStatus::DISINTEGRATED}; + Status.MakeWorst(NKikimrBlobStorage::TGroupStatus::DISINTEGRATED, NKikimrBlobStorage::TGroupStatus::DISINTEGRATED); } - return; } - TBlobStorageGroupInfo::TGroupVDisks failed(Topology.get()); - TBlobStorageGroupInfo::TGroupVDisks failedByPDisk(Topology.get()); - for (const TVSlotInfo *slot : VDisksInGroup) { - if (!slot->IsReady) { - failed |= {Topology.get(), slot->GetShortVDiskId()}; - } else if (!slot->PDisk->HasGoodExpectedStatus()) { - failedByPDisk |= {Topology.get(), slot->GetShortVDiskId()}; + if (VDisksInGroup) { + TBlobStorageGroupInfo::TGroupVDisks failed(Topology.get()); + TBlobStorageGroupInfo::TGroupVDisks failedByPDisk(Topology.get()); + for (const TVSlotInfo *slot : VDisksInGroup) { + if (!slot->IsReady) { + failed |= {Topology.get(), slot->GetShortVDiskId()}; + } else if (!slot->PDisk->HasGoodExpectedStatus()) { + failedByPDisk |= {Topology.get(), slot->GetShortVDiskId()}; + } } + auto deriveStatus = [&](const auto& failed) { + auto& checker = *Topology->QuorumChecker; + if (!failed.GetNumSetItems()) { // all disks of group are operational + return NKikimrBlobStorage::TGroupStatus::FULL; + } else if (!checker.CheckFailModelForGroup(failed)) { // fail model exceeded + return NKikimrBlobStorage::TGroupStatus::DISINTEGRATED; + } else if (checker.IsDegraded(failed)) { // group degraded + return NKikimrBlobStorage::TGroupStatus::DEGRADED; + } else if (failed.GetNumSetItems()) { // group partially available, but not degraded + return NKikimrBlobStorage::TGroupStatus::PARTIAL; + } else { + Y_FAIL("unexpected case"); + } + }; + Status.MakeWorst(deriveStatus(failed), deriveStatus(failed | failedByPDisk)); } - auto deriveStatus = [&](const auto& failed) { - auto& checker = *Topology->QuorumChecker; - if (!failed.GetNumSetItems()) { // all disks of group are operational - return NKikimrBlobStorage::TGroupStatus::FULL; - } else if (!checker.CheckFailModelForGroup(failed)) { // fail model exceeded - return NKikimrBlobStorage::TGroupStatus::DISINTEGRATED; - } else if (checker.IsDegraded(failed)) { // group degraded - return NKikimrBlobStorage::TGroupStatus::DEGRADED; - } else if (failed.GetNumSetItems()) { // group partially available, but not degraded - return NKikimrBlobStorage::TGroupStatus::PARTIAL; - } else { - Y_FAIL("unexpected case"); - } - }; - Status = { - deriveStatus(failed), - deriveStatus(failed | failedByPDisk), - }; } void TBlobStorageController::OnActivateExecutor(const TActorContext&) { diff --git a/ydb/core/mind/bscontroller/cmds_drive_status.cpp b/ydb/core/mind/bscontroller/cmds_drive_status.cpp index 25fe500b2c2..c41f85b5a89 100644 --- a/ydb/core/mind/bscontroller/cmds_drive_status.cpp +++ b/ydb/core/mind/bscontroller/cmds_drive_status.cpp @@ -174,7 +174,7 @@ namespace NKikimr::NBsController { Y_VERIFY(success); driveInfoNew->PDiskConfig = config; - STLOG(PRI_NOTICE, BS_CONTROLLER_AUDIT, BSCA06, "AddDriveSerial", (Serial, newSerial), (BoxId, boxId)); + STLOG(PRI_NOTICE, BS_CONTROLLER_AUDIT, BSCA00, "AddDriveSerial", (Serial, newSerial), (BoxId, boxId)); } void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TRemoveDriveSerial& cmd, diff --git a/ydb/core/mind/bscontroller/config.cpp b/ydb/core/mind/bscontroller/config.cpp index 0b2f820ac89..46b20dc4e4d 100644 --- a/ydb/core/mind/bscontroller/config.cpp +++ b/ydb/core/mind/bscontroller/config.cpp @@ -190,7 +190,7 @@ namespace 
NKikimr::NBsController { //////////////////////////////////////////////////////////////////////////////////////////////////////////// void ApplyGroupCreated(const TGroupId& /*groupId*/, const TGroupInfo &groupInfo) { - if (groupInfo.VirtualGroupState && *groupInfo.VirtualGroupState != NKikimrBlobStorage::EVirtualGroupState::WORKING) { + if (!groupInfo.VDisksInGroup && groupInfo.VirtualGroupState != NKikimrBlobStorage::EVirtualGroupState::WORKING) { return; // do not report virtual groups that are not properly created yet } @@ -330,7 +330,6 @@ namespace NKikimr::NBsController { MakeTableMerger<Schema::Box>(&Boxes, &state.Boxes.Get(), this)(txc); MakeTableMerger<Schema::BoxStoragePool>(&StoragePools, &state.StoragePools.Get(), this)(txc); MakeTableMerger<Schema::Node>(&Nodes, &state.Nodes.Get(), this)(txc); - MakeTableMerger<Schema::VirtualGroupPool>(&VirtualGroupPools, &state.VirtualGroupPools.Get(), this)(txc); // apply overlay maps to their respective tables state.PDisks.ApplyToTable(this, txc); @@ -636,8 +635,6 @@ namespace NKikimr::NBsController { Y_VERIFY(vslot->GroupId == groupId); Y_VERIFY(vslot->GroupGeneration == group.Generation); } - Y_VERIFY_DEBUG((group.DecommitStatus == NKikimrBlobStorage::EGroupDecommitStatus::NONE) == - group.AssimilatorGroupId.Empty()); }); #endif } @@ -912,7 +909,7 @@ namespace NKikimr::NBsController { pb->SetX2(x.second); } - if (!groupInfo.VirtualGroupState) { + if (groupInfo.VDisksInGroup) { group->SetErasureSpecies(groupInfo.ErasureSpecies); group->SetDeviceType(PDiskTypeToPDiskType(groupInfo.GetCommonDeviceType())); @@ -939,16 +936,17 @@ namespace NKikimr::NBsController { Serialize(domain->AddVDiskLocations(), *vslot); } - } else if (*groupInfo.VirtualGroupState == NKikimrBlobStorage::EVirtualGroupState::WORKING) { + } + + if (groupInfo.VirtualGroupState == NKikimrBlobStorage::EVirtualGroupState::WORKING) { Y_VERIFY(groupInfo.BlobDepotId); group->SetBlobDepotId(*groupInfo.BlobDepotId); - } else if (*groupInfo.VirtualGroupState == NKikimrBlobStorage::EVirtualGroupState::CREATE_FAILED) { + } else if (groupInfo.VirtualGroupState == NKikimrBlobStorage::EVirtualGroupState::CREATE_FAILED) { group->SetBlobDepotId(0); } - if (groupInfo.DecommitStatus != NKikimrBlobStorage::EGroupDecommitStatus::NONE) { - Y_VERIFY(groupInfo.AssimilatorGroupId); - group->SetAssimilatorGroupId(*groupInfo.AssimilatorGroupId); + if (groupInfo.DecommitStatus != NKikimrBlobStorage::TGroupDecommitStatus::NONE) { + group->SetDecommitStatus(groupInfo.DecommitStatus); } } diff --git a/ydb/core/mind/bscontroller/config.h b/ydb/core/mind/bscontroller/config.h index 383b15f88c4..41f4581b9f2 100644 --- a/ydb/core/mind/bscontroller/config.h +++ b/ydb/core/mind/bscontroller/config.h @@ -54,7 +54,6 @@ namespace NKikimr { // system-level configuration TOverlayMap<TPDiskId, TPDiskInfo> PDisks; TOverlayMap<TSerial, TDriveSerialInfo> DrivesSerials; - TCowHolder<TMap<Schema::VirtualGroupPool::TKey::Type, TVirtualGroupPool>> VirtualGroupPools; TCowHolder<TMap<TNodeId, TNodeInfo>> Nodes; TOverlayMap<TVSlotId, TVSlotInfo> VSlots; TOverlayMap<TGroupId, TGroupInfo> Groups; @@ -115,7 +114,6 @@ namespace NKikimr { , StoragePoolGroups(&controller.StoragePoolGroups) , PDisks(controller.PDisks) , DrivesSerials(controller.DrivesSerials) - , VirtualGroupPools(&controller.VirtualGroupPools) , Nodes(&controller.Nodes) , VSlots(controller.VSlots) , Groups(controller.GroupMap) @@ -144,7 +142,6 @@ namespace NKikimr { StoragePoolGroups.Commit(); PDisks.Commit(); DrivesSerials.Commit(); - VirtualGroupPools.Commit(); 
Nodes.Commit(); VSlots.Commit(); Groups.Commit(); @@ -165,8 +162,7 @@ namespace NKikimr { return HostConfigs.Changed() || Boxes.Changed() || StoragePools.Changed() || StoragePoolGroups.Changed() || PDisks.Changed() || DrivesSerials.Changed() || Nodes.Changed() || VSlots.Changed() || Groups.Changed() || IndexGroupSpeciesToGroup.Changed() || NextGroupId.Changed() || - NextStoragePoolId.Changed() || SerialManagementStage.Changed() || VirtualGroupPools.Changed() || - NextVirtualGroupId.Changed(); + NextStoragePoolId.Changed() || SerialManagementStage.Changed() || NextVirtualGroupId.Changed(); } bool NormalizeHostKey(NKikimrBlobStorage::THostKey *host) const { @@ -221,8 +217,6 @@ namespace NKikimr { } } - void InitiateGroupDecommission(ui32 virtualGroupId, ui32 groupToDecommit); - private: template<typename TCommand, typename TKey, typename TValue> static ui64 CheckGeneration(const TCommand &cmd, const TMap<TKey, TValue> &map, const TKey &id) { diff --git a/ydb/core/mind/bscontroller/defs.h b/ydb/core/mind/bscontroller/defs.h index d9a778209bf..a6e70e2ccba 100644 --- a/ydb/core/mind/bscontroller/defs.h +++ b/ydb/core/mind/bscontroller/defs.h @@ -6,6 +6,7 @@ #include <ydb/core/base/appdata.h> #include <ydb/core/base/blobstorage.h> #include <ydb/core/base/counters.h> +#include <ydb/core/base/hive.h> #include <ydb/core/base/group_stat.h> #include <ydb/core/base/services/blobstorage_service_id.h> #include <ydb/core/base/tablet_pipe.h> @@ -14,6 +15,7 @@ #include <ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h> #include <ydb/core/blobstorage/groupinfo/blobstorage_groupinfo_blobmap.h> #include <ydb/core/blobstorage/groupinfo/blobstorage_groupinfo_sets.h> +#include <ydb/core/blob_depot/events.h> #include <ydb/core/engine/minikql/flat_local_tx_factory.h> #include <ydb/core/mind/table_adapter.h> #include <ydb/core/protos/blobstorage_config.pb.h> diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h index 717b9392f40..f1f5d66443b 100644 --- a/ydb/core/mind/bscontroller/impl.h +++ b/ydb/core/mind/bscontroller/impl.h @@ -494,17 +494,12 @@ public: bool PersistedDown = false; // the value stored in the database bool SeenOperational = false; - Table::DecommitStatus::Type DecommitStatus = NKikimrBlobStorage::EGroupDecommitStatus::NONE; - TMaybe<Table::AssimilatorGroupId::Type> AssimilatorGroupId; + Table::DecommitStatus::Type DecommitStatus = NKikimrBlobStorage::TGroupDecommitStatus::NONE; - TMaybe<Table::VirtualGroupPool::Type> VirtualGroupPool; + TMaybe<Table::VirtualGroupName::Type> VirtualGroupName; TMaybe<Table::VirtualGroupState::Type> VirtualGroupState; - TMaybe<Table::ParentDir::Type> ParentDir; - TMaybe<Table::Name::Type> Name; - TMaybe<Table::SchemeshardId::Type> SchemeshardId; + TMaybe<Table::HiveId::Type> HiveId; TMaybe<Table::BlobDepotConfig::Type> BlobDepotConfig; - TMaybe<Table::TxId::Type> TxId; - TMaybe<Table::PathId::Type> PathId; TMaybe<Table::BlobDepotId::Type> BlobDepotId; TMaybe<Table::ErrorReason::Type> ErrorReason; TMaybe<Table::NeedAlter::Type> NeedAlter; @@ -535,6 +530,11 @@ public: NKikimrBlobStorage::TGroupStatus::E OperatingStatus = NKikimrBlobStorage::TGroupStatus::UNKNOWN; // status derived by adding underlying PDisk status (some of them are assumed to be not working ones) NKikimrBlobStorage::TGroupStatus::E ExpectedStatus = NKikimrBlobStorage::TGroupStatus::UNKNOWN; + + void MakeWorst(NKikimrBlobStorage::TGroupStatus::E operating, NKikimrBlobStorage::TGroupStatus::E expected) { + OperatingStatus = Max(OperatingStatus, operating); + 
ExpectedStatus = Max(ExpectedStatus, expected); + } } Status; // group status depends on the IsReady value for every VDisk; so it has to be updated every time there is possible @@ -564,15 +564,10 @@ public: Table::MainKeyVersion, Table::SeenOperational, Table::DecommitStatus, - Table::AssimilatorGroupId, - Table::VirtualGroupPool, + Table::VirtualGroupName, Table::VirtualGroupState, - Table::ParentDir, - Table::Name, - Table::SchemeshardId, + Table::HiveId, Table::BlobDepotConfig, - Table::TxId, - Table::PathId, Table::BlobDepotId, Table::ErrorReason, Table::NeedAlter @@ -590,15 +585,10 @@ public: &TGroupInfo::MainKeyVersion, &TGroupInfo::SeenOperational, &TGroupInfo::DecommitStatus, - &TGroupInfo::AssimilatorGroupId, - &TGroupInfo::VirtualGroupPool, + &TGroupInfo::VirtualGroupName, &TGroupInfo::VirtualGroupState, - &TGroupInfo::ParentDir, - &TGroupInfo::Name, - &TGroupInfo::SchemeshardId, + &TGroupInfo::HiveId, &TGroupInfo::BlobDepotConfig, - &TGroupInfo::TxId, - &TGroupInfo::PathId, &TGroupInfo::BlobDepotId, &TGroupInfo::ErrorReason, &TGroupInfo::NeedAlter @@ -652,7 +642,8 @@ public: } bool Listable() const { - return !VirtualGroupState + return VDisksInGroup + || !VirtualGroupState || *VirtualGroupState == NKikimrBlobStorage::EVirtualGroupState::WORKING || *VirtualGroupState == NKikimrBlobStorage::EVirtualGroupState::CREATE_FAILED; } @@ -701,7 +692,7 @@ public: } void FillInGroupParameters(NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters *params) const { - if (VirtualGroupState) { + if (!VDisksInGroup) { for (auto *p : {params->MutableAssuredResources(), params->MutableCurrentResources()}) { p->SetSpace(1'000'000'000'000); p->SetIOPS(1'000); @@ -1325,28 +1316,6 @@ public: void OnClone(const THolder<TDriveSerialInfo>&) {} }; - struct TVirtualGroupPool { - using Table = Schema::VirtualGroupPool; - - TMaybe<Table::Generation::Type> Generation; - - TVirtualGroupPool() = default; - TVirtualGroupPool(const TVirtualGroupPool&) = default; - - template<typename T> - static void Apply(TBlobStorageController* /*controller*/, T&& callback) { - static TTableAdapter<Table, TVirtualGroupPool, - Table::Generation - > adapter( - &TVirtualGroupPool::Generation - ); - callback(&adapter); - } - - void OnCommit() {} - void OnClone(const THolder<TVirtualGroupPool>&) {} - }; - struct THostRecord { TNodeId NodeId; TNodeLocation Location; @@ -1418,7 +1387,6 @@ private: TVSlots VSlots; // ordering is important TPDisks PDisks; // ordering is important TMap<TSerial, THolder<TDriveSerialInfo>> DrivesSerials; - TMap<Schema::VirtualGroupPool::TKey::Type, TVirtualGroupPool> VirtualGroupPools; TGroups GroupMap; THashMap<TGroupId, TGroupInfo*> GroupLookup; TMap<TGroupSpecies, TVector<TGroupId>> IndexGroupSpeciesToGroup; @@ -1945,7 +1913,7 @@ public: } if (const TDuration time = TDuration::Seconds(timer.Passed()); time >= TDuration::MilliSeconds(100)) { - STLOG(PRI_ERROR, BS_CONTROLLER, BSC07, "StateWork event processing took too much time", (Type, type), + STLOG(PRI_ERROR, BS_CONTROLLER, BSC00, "StateWork event processing took too much time", (Type, type), (Duration, time)); } } diff --git a/ydb/core/mind/bscontroller/load_everything.cpp b/ydb/core/mind/bscontroller/load_everything.cpp index 3b7a9a2f950..6e8aa9741f6 100644 --- a/ydb/core/mind/bscontroller/load_everything.cpp +++ b/ydb/core/mind/bscontroller/load_everything.cpp @@ -176,9 +176,9 @@ public: const TBoxStoragePoolId storagePoolId = it->second; groupToStoragePool.erase(it); - const bool isVirtualGroup = 
TGroupID(groups.GetKey()).ConfigurationType() == EGroupConfigurationType::Virtual; + // geometry may be absent for virtual or finally decommitted group const auto geomIt = geometry.find(groups.GetKey()); - Y_VERIFY(isVirtualGroup || geomIt != geometry.end()); + const auto geom = geomIt != geometry.end() ? geomIt->second : std::make_tuple(0u, 0u, 0u); TGroupInfo& group = Self->AddGroup(groups.GetKey(), groups.GetValue<T::Generation>(), @@ -195,28 +195,21 @@ public: groups.GetValueOrDefault<T::Down>(), groups.GetValueOrDefault<T::SeenOperational>(), storagePoolId, - isVirtualGroup ? 0 : std::get<0>(geomIt->second), - isVirtualGroup ? 0 : std::get<1>(geomIt->second), - isVirtualGroup ? 0 : std::get<2>(geomIt->second)); + std::get<0>(geom), + std::get<1>(geom), + std::get<2>(geom)); group.DecommitStatus = groups.GetValueOrDefault<T::DecommitStatus>(); - if (groups.HaveValue<T::AssimilatorGroupId>()) { - group.AssimilatorGroupId = groups.GetValue<T::AssimilatorGroupId>(); - } #define OPTIONAL(NAME) \ if (groups.HaveValue<T::NAME>()) { \ group.NAME = groups.GetValue<T::NAME>(); \ } - OPTIONAL(VirtualGroupPool) + OPTIONAL(VirtualGroupName) OPTIONAL(VirtualGroupState) - OPTIONAL(ParentDir) - OPTIONAL(Name) - OPTIONAL(SchemeshardId) + OPTIONAL(HiveId) OPTIONAL(BlobDepotConfig) - OPTIONAL(TxId) - OPTIONAL(PathId) OPTIONAL(BlobDepotId) OPTIONAL(ErrorReason) @@ -234,8 +227,7 @@ public: if (!NTableAdapter::FetchTable<Schema::HostConfig>(db, Self, Self->HostConfigs) || !NTableAdapter::FetchTable<Schema::Box>(db, Self, Self->Boxes) || !NTableAdapter::FetchTable<Schema::BoxStoragePool>(db, Self, Self->StoragePools) - || !NTableAdapter::FetchTable<Schema::DriveSerial>(db, Self, Self->DrivesSerials) - || !NTableAdapter::FetchTable<Schema::VirtualGroupPool>(db, Self, Self->VirtualGroupPools)) { + || !NTableAdapter::FetchTable<Schema::DriveSerial>(db, Self, Self->DrivesSerials)) { return false; } for (const auto& [storagePoolId, storagePool] : Self->StoragePools) { diff --git a/ydb/core/mind/bscontroller/scheme.h b/ydb/core/mind/bscontroller/scheme.h index d724a498e63..93c8bd144a7 100644 --- a/ydb/core/mind/bscontroller/scheme.h +++ b/ydb/core/mind/bscontroller/scheme.h @@ -61,18 +61,13 @@ struct Schema : NIceDb::Schema { struct MainKeyVersion : Column<12, NScheme::NTypeIds::Uint64> { static constexpr Type Default = 0; }; struct Down : Column<13, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; }; struct SeenOperational : Column<14, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; }; - struct DecommitStatus : Column<15, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::EGroupDecommitStatus; }; - struct AssimilatorGroupId : Column<16, Group::ID::ColumnType> {}; // for the group being decommitted + struct DecommitStatus : Column<15, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::TGroupDecommitStatus::E; }; // VirtualGroup management code - struct VirtualGroupPool : Column<101, NScheme::NTypeIds::Utf8> {}; // VG pool identifier + struct VirtualGroupName : Column<112, NScheme::NTypeIds::Utf8> {}; // unique name of the virtual group struct VirtualGroupState : Column<102, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::EVirtualGroupState; }; - struct ParentDir : Column<103, NScheme::NTypeIds::Utf8> {}; // scheme directory containing blob depot being created - struct Name : Column<104, NScheme::NTypeIds::Utf8> {}; // name of the blob depot - struct SchemeshardId : Column<105, NScheme::NTypeIds::Uint64> {}; // enclosing Schemeshard tablet id + 
struct HiveId : Column<113, NScheme::NTypeIds::Uint64> {}; // hive id for this vg struct BlobDepotConfig : Column<106, NScheme::NTypeIds::String> {}; // serialized blob depot config protobuf - struct TxId : Column<107, NScheme::NTypeIds::Uint64> {}; // TxId for pending scheme operation - struct PathId : Column<108, NScheme::NTypeIds::Uint64> {}; // path id for created path struct BlobDepotId : Column<109, NScheme::NTypeIds::Uint64> {}; // created blobdepot tablet id struct ErrorReason : Column<110, NScheme::NTypeIds::Utf8> {}; // creation error reason struct NeedAlter : Column<111, NScheme::NTypeIds::Bool> {}; // did the BlobDepotConfig change? @@ -80,8 +75,8 @@ struct Schema : NIceDb::Schema { using TKey = TableKey<ID>; using TColumns = TableColumns<ID, Generation, ErasureSpecies, Owner, DesiredPDiskCategory, DesiredVDiskCategory, EncryptionMode, LifeCyclePhase, MainKeyId, EncryptedGroupKey, GroupKeyNonce, MainKeyVersion, Down, - SeenOperational, DecommitStatus, AssimilatorGroupId, VirtualGroupPool, VirtualGroupState, ParentDir, Name, - SchemeshardId, BlobDepotConfig, TxId, PathId, BlobDepotId, ErrorReason, NeedAlter>; + SeenOperational, DecommitStatus, VirtualGroupName, VirtualGroupState, HiveId, BlobDepotConfig, + BlobDepotId, ErrorReason, NeedAlter>; }; struct State : Table<1> { @@ -391,14 +386,6 @@ struct Schema : NIceDb::Schema { using TColumns = TableColumns<Serial, BoxId, NodeId, PDiskId, Guid, LifeStage, Kind, PDiskType, PDiskConfig>; }; - struct VirtualGroupPool : Table<130> { - struct Id : Column<1, NScheme::NTypeIds::Utf8> {}; - struct Generation : Column<2, NScheme::NTypeIds::Uint64> {}; - - using TKey = TableKey<Id>; - using TColumns = TableColumns<Id, Generation>; - }; - using TTables = SchemaTables< Node, PDisk, @@ -421,8 +408,7 @@ struct Schema : NIceDb::Schema { MigrationPlan, MigrationEntry, ScrubState, - DriveSerial, - VirtualGroupPool + DriveSerial >; using TSettings = SchemaSettings< diff --git a/ydb/core/mind/bscontroller/virtual_group.cpp b/ydb/core/mind/bscontroller/virtual_group.cpp index 768bc8f68cf..91afd867262 100644 --- a/ydb/core/mind/bscontroller/virtual_group.cpp +++ b/ydb/core/mind/bscontroller/virtual_group.cpp @@ -4,16 +4,25 @@ namespace NKikimr::NBsController { void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TAllocateVirtualGroup& cmd, TStatus& status) { - const TString id = cmd.GetVirtualGroupPool(); - if (!id) { - throw TExError() << "VirtualGroupPool can't be empty"; + if (!cmd.GetName()) { + throw TExError() << "TAllocateVirtualGroup.Name must be set and be nonempty"; + } else if (!cmd.GetHiveId()) { + throw TExError() << "TAllocateVirtualGroup.HiveId is not specified"; } - // update record generation to prevent races - const ui64 nextGen = CheckGeneration(cmd, VirtualGroupPools.Get(), id); - auto& map = VirtualGroupPools.Unshare(); - auto& item = map[id]; - item.Generation = nextGen; + // make sure the operation is idempotent + struct TExFound { TGroupId id; }; + try { + Groups.ForEach([&](TGroupId key, const TGroupInfo& value) { + if (value.VirtualGroupName == cmd.GetName()) { + throw TExFound{key}; + } + }); + } catch (const TExFound& ex) { + status.AddGroupId(ex.id); + status.SetAlready(true); + return; + } // allocate group identifier auto& nextGroupId = NextVirtualGroupId.Unshare(); @@ -66,10 +75,9 @@ namespace NKikimr::NBsController { ++pool.NumGroups; StoragePoolGroups.Unshare().emplace(storagePoolId, group->ID); - group->VirtualGroupPool = id; - group->VirtualGroupState = 
NKikimrBlobStorage::EVirtualGroupState::CREATED; - group->ParentDir = cmd.GetParentDir(); - group->Name = TStringBuilder() << "vgroup" << groupId.GetRaw(); + group->VirtualGroupName = cmd.GetName(); + group->VirtualGroupState = NKikimrBlobStorage::EVirtualGroupState::NEW; + group->HiveId = cmd.GetHiveId(); if (cmd.GetBlobDepotId()) { group->VirtualGroupState = NKikimrBlobStorage::EVirtualGroupState::WORKING; @@ -86,46 +94,34 @@ namespace NKikimr::NBsController { const bool success = config.SerializeToString(&group->BlobDepotConfig.ConstructInPlace()); Y_VERIFY(success); - for (const TGroupId groupToDecommit : cmd.GetDecommitGroups()) { - InitiateGroupDecommission(group->ID, groupToDecommit); - } - status.AddGroupId(group->ID); } void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TDecommitGroups& cmd, TStatus& /*status*/) { - for (const TGroupId groupToDecommit : cmd.GetDecommitGroups()) { - InitiateGroupDecommission(cmd.GetVirtualGroupId(), groupToDecommit); + if (!cmd.GetHiveId()) { + throw TExError() << "TDecommitGroups.HiveId is not specified"; } - } - void TBlobStorageController::TConfigState::InitiateGroupDecommission(ui32 virtualGroupId, ui32 groupToDecommit) { - TGroupInfo *virtualGroup = Groups.FindForUpdate(virtualGroupId); - if (!virtualGroup) { - throw TExError() << "virtual group not found" << TErrorParams::GroupId(virtualGroupId); - } else if (!virtualGroup->BlobDepotConfig) { - throw TExError() << "virtual group blob depot config is not filled in" << TErrorParams::GroupId(virtualGroup->ID); - } + for (const TGroupId groupId : cmd.GetGroupIds()) { + TGroupInfo *group = Groups.FindForUpdate(groupId); + if (!group) { + throw TExError() << "group not found" << TErrorParams::GroupId(groupId); + } else if (group->DecommitStatus != NKikimrBlobStorage::TGroupDecommitStatus::NONE) { + throw TExError() << "group is already being decommitted" << TErrorParams::GroupId(groupId); + } - auto *group = Groups.FindForUpdate(groupToDecommit); - if (!group) { - throw TExError() << "group id for decomission is not found" << TErrorParams::GroupId(groupToDecommit); - } else if (group->DecommitStatus != NKikimrBlobStorage::EGroupDecommitStatus::NONE) { - throw TExError() << "group is already being decommitted" << TErrorParams::GroupId(group->ID); - } else if (group->AssimilatorGroupId) { - throw TExError() << "don't know how, but AssimilatorGroupId is already filled in" << TErrorParams::GroupId(group->ID); - } - group->DecommitStatus = NKikimrBlobStorage::EGroupDecommitStatus::PENDING; - group->AssimilatorGroupId = virtualGroup->ID; - group->ContentChanged = true; // advance group generation to push configs forcibly to all concerned nodes + group->DecommitStatus = NKikimrBlobStorage::TGroupDecommitStatus::PENDING; + group->VirtualGroupState = NKikimrBlobStorage::EVirtualGroupState::NEW; + group->HiveId = cmd.GetHiveId(); + group->NeedAlter = true; - NKikimrBlobDepot::TBlobDepotConfig blobDepotConfig; - if (!blobDepotConfig.ParseFromString(*virtualGroup->BlobDepotConfig)) { - throw TExError() << "failed to parse virtual group blob depot config" << TErrorParams::GroupId(virtualGroup->ID); - } - blobDepotConfig.AddDecommittingGroups(groupToDecommit); - if (!blobDepotConfig.SerializeToString(&*virtualGroup->BlobDepotConfig)) { - throw TExError() << "failed to serialize virtual group blob depot config" << TErrorParams::GroupId(virtualGroup->ID); + NKikimrBlobDepot::TBlobDepotConfig config; + config.SetOperationMode(NKikimrBlobDepot::EOperationMode::VirtualGroup); + 
config.MutableChannelProfiles()->CopyFrom(cmd.GetChannelProfiles()); + config.SetDecommitGroupId(groupId); + + const bool success = config.SerializeToString(&group->BlobDepotConfig.ConstructInPlace()); + Y_VERIFY(success); } } @@ -156,12 +152,7 @@ namespace NKikimr::NBsController { row.Update<T::NAME>(*cell); \ } PARAM(VirtualGroupState) - PARAM(ParentDir) - PARAM(Name) - PARAM(SchemeshardId) PARAM(BlobDepotConfig) - PARAM(TxId) - PARAM(PathId) PARAM(BlobDepotId) PARAM(ErrorReason) PARAM(NeedAlter) @@ -187,6 +178,40 @@ namespace NKikimr::NBsController { } }; + class TTxCommitDecommit : public TTransactionBase<TBlobStorageController> { + TVirtualGroupSetupMachine *Machine; + std::optional<TConfigState> State; + + public: + TTxCommitDecommit(TVirtualGroupSetupMachine *machine) + : TTransactionBase(machine->Self) + , Machine(machine) + {} + + bool Execute(TTransactionContext& txc, const TActorContext&) override { + State.emplace(*Self, Self->HostRecords, TActivationContext::Now()); + State->CheckConsistency(); + TGroupInfo *group = State->Groups.FindForUpdate(Machine->GroupId); + if (group && group->DecommitStatus == NKikimrBlobStorage::TGroupDecommitStatus::PENDING) { + group->DecommitStatus = NKikimrBlobStorage::TGroupDecommitStatus::IN_PROGRESS; + group->ContentChanged = true; + } + State->CheckConsistency(); + TString error; + if (State->Changed() && !Self->CommitConfigUpdates(*State, false, false, txc, &error)) { + State->Rollback(); + State.reset(); + } + return true; + } + + void Complete(const TActorContext&) override { + if (State) { + State->ApplyConfigUpdates(); + } + } + }; + public: TVirtualGroupSetupMachine(TBlobStorageController *self, TGroupInfo& group) : Self(self) @@ -204,143 +229,140 @@ namespace NKikimr::NBsController { Y_VERIFY(group->VirtualGroupState); STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG01, "Bootstrap", (GroupId, GroupId), - (State, NKikimrBlobStorage::EVirtualGroupState_Name(*group->VirtualGroupState))); + (State, NKikimrBlobStorage::EVirtualGroupState_Name(*group->VirtualGroupState)), + (NeedAlter, group->NeedAlter)); switch (*group->VirtualGroupState) { - case NKikimrBlobStorage::EVirtualGroupState::CREATED: - IssueCreateOrAlterPathRequest(group, NKikimrSchemeOp::ESchemeOpCreateBlobDepot); + case NKikimrBlobStorage::EVirtualGroupState::NEW: + HiveCreate(group); break; case NKikimrBlobStorage::EVirtualGroupState::WORKING: + if (group->NeedAlter.GetOrElse(false)) { + return ConfigureBlobDepot(); + } + [[fallthrough]]; case NKikimrBlobStorage::EVirtualGroupState::CREATE_FAILED: IssueNodeNotifications(group); group->VirtualGroupSetupMachineId = {}; PassAway(); break; - case NKikimrBlobStorage::EVirtualGroupState::WAIT_SCHEMESHARD_CREATE: - SubscribeToSchemeshard(group); - break; - - case NKikimrBlobStorage::EVirtualGroupState::WAIT_SCHEMESHARD_ALTER: - IssueCreateOrAlterPathRequest(group, NKikimrSchemeOp::ESchemeOpAlterBlobDepot); - group->NeedAlter = false; - break; - default: Y_FAIL(); } } - void IssueCreateOrAlterPathRequest(TGroupInfo *group, NKikimrSchemeOp::EOperationType op) { - STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG02, "IssueCreateOrAlterPathRequest", (GroupId, GroupId), - (ParentDir, *group->ParentDir), (Name, *group->Name)); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - auto request = std::make_unique<TEvTxUserProxy::TEvProposeTransaction>(); - auto& record = request->Record; + TActorId HivePipeId; + TActorId BlobDepotPipeId; + ui64 BlobDepotTabletId = 0; - Y_VERIFY(group->ParentDir); - 
Y_VERIFY(group->Name); - Y_VERIFY(group->BlobDepotConfig); + void HiveCreate(TGroupInfo *group) { + Y_VERIFY(group->HiveId); + HivePipeId = Register(NTabletPipe::CreateClient(SelfId(), *group->HiveId, + NTabletPipe::TClientRetryPolicy::WithRetries())); - auto *tx = record.MutableTransaction(); - auto *scheme = tx->MutableModifyScheme(); - scheme->SetWorkingDir(*group->ParentDir); - scheme->SetOperationType(op); - scheme->SetInternal(true); - scheme->SetFailOnExist(false); // this operation has to be idempotent - auto *bd = scheme->MutableBlobDepot(); - bd->SetName(*group->Name); - const bool success = bd->MutableConfig()->ParseFromString(*group->BlobDepotConfig); + NKikimrBlobDepot::TBlobDepotConfig config; + Y_VERIFY(group->BlobDepotConfig); + const bool success = config.ParseFromString(*group->BlobDepotConfig); Y_VERIFY(success); - Send(MakeTxProxyID(), request.release()); - } - - void Handle(TEvTxUserProxy::TEvProposeTransactionStatus::TPtr ev) { - if (Expired()) { - return PassAway(); - } - - TGroupInfo *group = GetGroup(); - - STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG01, "got TEvProposeTransactionStatus", (GroupId, GroupId), - (ParentDir, group->ParentDir), (Name, group->Name), (Msg, ev->Get()->Record.DebugString())); - - const auto& record = ev->Get()->Record; - if (record.GetSchemeShardStatus() == NKikimrScheme::StatusAccepted || record.GetSchemeShardStatus() == NKikimrScheme::StatusAlreadyExists) { - group->VirtualGroupState = NKikimrBlobStorage::EVirtualGroupState::WAIT_SCHEMESHARD_CREATE; - group->SchemeshardId = record.GetSchemeShardTabletId(); - group->TxId = record.GetTxId(); - group->PathId = record.GetPathId(); - } else { - group->VirtualGroupState = NKikimrBlobStorage::EVirtualGroupState::CREATE_FAILED; - group->ErrorReason = record.GetSchemeShardReason(); + TChannelsBindings bindings; + for (const auto& item : config.GetChannelProfiles()) { + for (ui32 i = 0; i < item.GetCount(); ++i) { + NKikimrStoragePool::TChannelBind binding; + binding.SetStoragePoolName(item.GetStoragePoolName()); + bindings.push_back(std::move(binding)); + } } - group->CalculateGroupStatus(); - - Self->Execute(new TTxUpdateGroup(this)); - } - - //////////////////////////////////////////////////////////////////////////////////////////////////////////////// - TActorId SchemeshardPipeId; + auto ev = std::make_unique<TEvHive::TEvCreateTablet>(Self->TabletID(), group->ID, TTabletTypes::BlobDepot, bindings); + STLOG(PRI_INFO, BS_CONTROLLER, BSCVG00, "sending TEvCreateTablet", (TabletId, Self->TabletID()), + (GroupId, group->ID), (HiveId, *group->HiveId), (Msg, ev->Record)); - void SubscribeToSchemeshard(TGroupInfo *group) { - Y_VERIFY(group->SchemeshardId); - Y_VERIFY(group->TxId); - SchemeshardPipeId = Register(NTabletPipe::CreateClient(SelfId(), *group->SchemeshardId)); - NTabletPipe::SendData(SelfId(), SchemeshardPipeId, new NSchemeShard::TEvSchemeShard::TEvNotifyTxCompletion(*group->TxId)); + NTabletPipe::SendData(SelfId(), HivePipeId, ev.release()); } void Handle(TEvTabletPipe::TEvClientConnected::TPtr ev) { - STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG03, "TEvClientConnected", (GroupId, GroupId), (Msg, ev->Get()->ToString())); + STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG02, "received TEvClientConnected", (TabletId, Self->TabletID()), + (Status, ev->Get()->Status), (ClientId, ev->Get()->ClientId), (HivePipeId, HivePipeId), + (BlobDepotPipeId, BlobDepotPipeId)); + + if (ev->Get()->Status != NKikimrProto::OK) { + if (ev->Get()->ClientId == HivePipeId) { + HiveCreate(GetGroup()); + } else if (ev->Get()->ClientId == 
BlobDepotPipeId) { + ConfigureBlobDepot(); + } + } } void Handle(TEvTabletPipe::TEvClientDestroyed::TPtr ev) { - STLOG(PRI_NOTICE, BS_CONTROLLER, BSCVG04, "TEvClientDestroyed", (GroupId, GroupId), (Msg, ev->Get()->ToString())); - SchemeshardPipeId = {}; - TActivationContext::Schedule(TDuration::Seconds(5), new IEventHandle(TEvents::TSystem::Bootstrap, 0, SelfId(), {}, nullptr, 0)); + STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG03, "received TEvClientDestroyed", (TabletId, Self->TabletID()), + (ClientId, ev->Get()->ClientId), (HivePipeId, HivePipeId), (BlobDepotPipeId, BlobDepotPipeId)); + + OnPipeError(ev->Get()->ClientId); } - void Handle(NSchemeShard::TEvSchemeShard::TEvNotifyTxCompletionRegistered::TPtr ev) { - STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG05, "TEvNotifyTxCompletionRegistered", (GroupId, GroupId), (Msg, ev->Get()->ToString())); + void OnPipeError(TActorId clientId) { + if (clientId == HivePipeId) { + HivePipeId = {}; + } else if (clientId == BlobDepotPipeId) { + BlobDepotPipeId = {}; + } } - void Handle(NSchemeShard::TEvSchemeShard::TEvNotifyTxCompletionResult::TPtr ev) { - STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG06, "TEvNotifyTxCompletionResult", (GroupId, GroupId), (Msg, ev->Get()->ToString())); + void Handle(TEvHive::TEvCreateTabletReply::TPtr ev) { + STLOG(PRI_INFO, BS_CONTROLLER, BSCVG04, "received TEvCreateTabletReply", (TabletId, Self->TabletID()), + (Msg, ev->Get()->Record)); - if (Expired()) { - return PassAway(); + TGroupInfo *group = GetGroup(); + auto& record = ev->Get()->Record; + if (record.GetStatus() == NKikimrProto::OK || record.GetStatus() == NKikimrProto::ALREADY) { + BlobDepotTabletId = record.GetTabletID(); + Y_VERIFY(BlobDepotTabletId); + } else { + group->VirtualGroupState = NKikimrBlobStorage::EVirtualGroupState::CREATE_FAILED; + group->ErrorReason = SingleLineProto(record); + Self->Execute(new TTxUpdateGroup(this)); } - TGroupInfo *group = GetGroup(); - Y_VERIFY(group->SchemeshardId); - Y_VERIFY(group->PathId); - NTabletPipe::SendData(SelfId(), SchemeshardPipeId, new NSchemeShard::TEvSchemeShard::TEvDescribeScheme( - *group->SchemeshardId, *group->PathId)); + NTabletPipe::CloseAndForgetClient(SelfId(), HivePipeId); } - void Handle(NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr ev) { - const auto& record = ev->Get()->GetRecord(); - - STLOG(PRI_DEBUG, BS_CONTROLLER, BSCVG07, "TEvDescribeSchemeResult", (GroupId, GroupId), - (Record, record)); - - if (Expired()) { - return PassAway(); - } + void Handle(TEvHive::TEvTabletCreationResult::TPtr ev) { + STLOG(PRI_INFO, BS_CONTROLLER, BSCVG05, "received TEvTabletCreationResult", (TabletId, Self->TabletID()), + (Msg, ev->Get()->Record)); + ConfigureBlobDepot(); + } + + void ConfigureBlobDepot() { TGroupInfo *group = GetGroup(); + const ui64 tabletId = group->BlobDepotId.GetOrElse(BlobDepotTabletId); + BlobDepotPipeId = Register(NTabletPipe::CreateClient(SelfId(), tabletId, + NTabletPipe::TClientRetryPolicy::WithRetries())); + auto ev = std::make_unique<TEvBlobDepot::TEvApplyConfig>(); + Y_VERIFY(group->BlobDepotConfig); + const bool success = ev->Record.MutableConfig()->ParseFromString(*group->BlobDepotConfig); + Y_VERIFY(success); + NTabletPipe::SendData(SelfId(), BlobDepotPipeId, ev.release()); + } - const auto& desc = record.GetPathDescription().GetBlobDepotDescription(); + void Handle(TEvBlobDepot::TEvApplyConfigResult::TPtr /*ev*/) { + TGroupInfo *group = GetGroup(); group->VirtualGroupState = NKikimrBlobStorage::EVirtualGroupState::WORKING; - group->BlobDepotId = desc.GetTabletId(); - 
group->SeenOperational = true; - group->CalculateGroupStatus(); - Y_VERIFY(*group->BlobDepotId); - + group->BlobDepotId = BlobDepotTabletId; + group->NeedAlter = false; // race-check Self->Execute(new TTxUpdateGroup(this)); + + if (group->DecommitStatus == NKikimrBlobStorage::TGroupDecommitStatus::PENDING) { + Self->Execute(new TTxCommitDecommit(this)); + } + + NTabletPipe::CloseAndForgetClient(SelfId(), BlobDepotPipeId); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -360,21 +382,29 @@ namespace NKikimr::NBsController { //////////////////////////////////////////////////////////////////////////////////////////////////////////////// void PassAway() override { - NTabletPipe::CloseAndForgetClient(SelfId(), SchemeshardPipeId); + NTabletPipe::CloseClient(SelfId(), HivePipeId); + NTabletPipe::CloseClient(SelfId(), BlobDepotPipeId); TActorBootstrapped::PassAway(); } - STRICT_STFUNC(StateFunc, - cFunc(TEvents::TSystem::Poison, PassAway); - cFunc(TEvents::TSystem::Bootstrap, Bootstrap); - hFunc(TEvTxUserProxy::TEvProposeTransactionStatus, Handle); - - hFunc(TEvTabletPipe::TEvClientConnected, Handle); - hFunc(TEvTabletPipe::TEvClientDestroyed, Handle); - hFunc(NSchemeShard::TEvSchemeShard::TEvNotifyTxCompletionRegistered, Handle); - hFunc(NSchemeShard::TEvSchemeShard::TEvNotifyTxCompletionResult, Handle); - hFunc(NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult, Handle); - ) + void StateFunc(STFUNC_SIG) { + Y_UNUSED(ctx); + if (Expired()) { + return PassAway(); + } + switch (const ui32 type = ev->GetTypeRewrite()) { + cFunc(TEvents::TSystem::Poison, PassAway); + cFunc(TEvents::TSystem::Bootstrap, Bootstrap); + hFunc(TEvTabletPipe::TEvClientConnected, Handle); + hFunc(TEvTabletPipe::TEvClientDestroyed, Handle); + hFunc(TEvHive::TEvCreateTabletReply, Handle); + hFunc(TEvHive::TEvTabletCreationResult, Handle); + hFunc(TEvBlobDepot::TEvApplyConfigResult, Handle); + + default: + Y_VERIFY_DEBUG(false, "unexpected event Type# %08" PRIx32, type); + } + } TGroupInfo *GetGroup() { TGroupInfo *res = Self->FindGroup(GroupId); diff --git a/ydb/core/protos/blob_depot.proto b/ydb/core/protos/blob_depot.proto index ae94e3c8368..7cf3743c679 100644 --- a/ydb/core/protos/blob_depot.proto +++ b/ydb/core/protos/blob_depot.proto @@ -76,7 +76,7 @@ message TEvRegisterAgentResult { } optional uint32 Generation = 1; repeated TChannelKind ChannelKinds = 2; - repeated uint32 DecommittingGroups = 3; + optional uint32 DecommitGroupId = 3; } message TEvAllocateIds { diff --git a/ydb/core/protos/blob_depot_config.proto b/ydb/core/protos/blob_depot_config.proto index 4b77e4e0a7d..da5f50ca624 100644 --- a/ydb/core/protos/blob_depot_config.proto +++ b/ydb/core/protos/blob_depot_config.proto @@ -9,9 +9,10 @@ message TChannelKind { } message TChannelProfile { - optional string StoragePoolKind = 1; - optional uint32 Count = 2; - optional TChannelKind.E ChannelKind = 3; + optional string StoragePoolName = 1; // used when creating tablet through BSC -> Hive + optional string StoragePoolKind = 2; // used when creating tablet through Schemeshard + optional uint32 Count = 3; + optional TChannelKind.E ChannelKind = 4; } enum EOperationMode { @@ -22,5 +23,5 @@ enum EOperationMode { message TBlobDepotConfig { optional EOperationMode OperationMode = 1; // can't be changed after tablet is created repeated TChannelProfile ChannelProfiles = 2; - repeated uint32 DecommittingGroups = 3; + optional uint32 DecommitGroupId = 3; // group that is being decommitted by this 
BlobDepot } diff --git a/ydb/core/protos/blobstorage.proto b/ydb/core/protos/blobstorage.proto index c8ccbc568de..0ea61ce9abb 100644 --- a/ydb/core/protos/blobstorage.proto +++ b/ydb/core/protos/blobstorage.proto @@ -164,6 +164,15 @@ enum EEntityStatus { RESTART = 4; // entity has changed config or changed environment and should be restarted by warden } +message TGroupDecommitStatus { + enum E { + NONE = 0; // no decomission + PENDING = 1; // decommission machinery is starting + IN_PROGRESS = 2; // decomission underway + DONE = 3; // group decomission complete + } +} + message TGroupInfo { message TFailRealm { message TFailDomain { @@ -187,7 +196,7 @@ message TGroupInfo { optional string StoragePoolName = 13; optional EPDiskType DeviceType = 14; optional uint64 BlobDepotId = 15; // if filled, then this is virtual group - optional uint32 AssimilatorGroupId = 16; // if filled, then this group is being assimilated by AssimilatorGroupId + optional TGroupDecommitStatus.E DecommitStatus = 16; } message TEvVPatchStart { diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto index 1f33a5fbaa7..b53765e4e00 100644 --- a/ydb/core/protos/blobstorage_config.proto +++ b/ydb/core/protos/blobstorage_config.proto @@ -18,18 +18,9 @@ enum EPDiskType { // Virtual group life cycle state enum EVirtualGroupState { - CREATED = 0; // just created - WAIT_SCHEMESHARD_CREATE = 1; // waiting for path to be created - CREATE_FAILED = 2; // error while creating VirtualGroup - WORKING = 3; // operational - WAIT_SCHEMESHARD_DELETE = 4; // waiting for path to be deleted - WAIT_SCHEMESHARD_ALTER = 5; // waiting for path to be altered (in case of configuration update) -} - -enum EGroupDecommitStatus { - NONE = 0; // no decomission - PENDING = 1; // decomission was asked - NO_WRITES_QUORUM_OBTAINED = 2; + NEW = 0; // just created + CREATE_FAILED = 1; // error while creating VirtualGroup + WORKING = 2; // operational } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -461,21 +452,20 @@ message TBoxStoragePoolId { } message TAllocateVirtualGroup { - string VirtualGroupPool = 1; - uint64 ItemConfigGeneration = 2; + string Name = 1; + uint64 HiveId = 2; oneof StoragePool { string StoragePoolName = 3; TBoxStoragePoolId StoragePoolId = 4; } - string ParentDir = 5; - repeated NKikimrBlobDepot.TChannelProfile ChannelProfiles = 6; - repeated uint32 DecommitGroups = 7; - uint64 BlobDepotId = 8; // when the tablet is already created; for testing purposes only + repeated NKikimrBlobDepot.TChannelProfile ChannelProfiles = 5; + uint64 BlobDepotId = 6; // when the tablet is already created; for testing purposes only } message TDecommitGroups { - uint32 VirtualGroupId = 1; - repeated uint32 DecommitGroups = 2; + repeated uint32 GroupIds = 1; // group ids to decommit + uint64 HiveId = 2; // hive under which it is required to create blob depot tablets + repeated NKikimrBlobDepot.TChannelProfile ChannelProfiles = 3; // where to store decommitted groups' data } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -704,6 +694,7 @@ message TConfigResponse { repeated TPDiskStat PDiskStat = 11; repeated TReassignedItem ReassignedItem = 14; TDeclareIntent Intent = 15; + bool Already = 16; } repeated TStatus Status = 1; diff --git 
a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema index 744e2e50a55..220e3a02e03 100644 --- a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema +++ b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema @@ -382,61 +382,26 @@ "ColumnType": "Bool" }, { - "ColumnId": 101, - "ColumnName": "VirtualGroupPool", - "ColumnType": "Utf8" - }, - { "ColumnId": 14, "ColumnName": "SeenOperational", "ColumnType": "Bool" }, { - "ColumnId": 102, - "ColumnName": "VirtualGroupState", - "ColumnType": "Uint32" - }, - { "ColumnId": 15, "ColumnName": "DecommitStatus", "ColumnType": "Uint32" }, { - "ColumnId": 103, - "ColumnName": "ParentDir", - "ColumnType": "Utf8" - }, - { - "ColumnId": 16, - "ColumnName": "AssimilatorGroupId", + "ColumnId": 102, + "ColumnName": "VirtualGroupState", "ColumnType": "Uint32" }, { - "ColumnId": 104, - "ColumnName": "Name", - "ColumnType": "Utf8" - }, - { - "ColumnId": 105, - "ColumnName": "SchemeshardId", - "ColumnType": "Uint64" - }, - { "ColumnId": 106, "ColumnName": "BlobDepotConfig", "ColumnType": "String" }, { - "ColumnId": 107, - "ColumnName": "TxId", - "ColumnType": "Uint64" - }, - { - "ColumnId": 108, - "ColumnName": "PathId", - "ColumnType": "Uint64" - }, - { "ColumnId": 109, "ColumnName": "BlobDepotId", "ColumnType": "Uint64" @@ -450,6 +415,16 @@ "ColumnId": 111, "ColumnName": "NeedAlter", "ColumnType": "Bool" + }, + { + "ColumnId": 112, + "ColumnName": "VirtualGroupName", + "ColumnType": "Utf8" + }, + { + "ColumnId": 113, + "ColumnName": "HiveId", + "ColumnType": "Uint64" } ], "ColumnsDropped": [], @@ -469,20 +444,15 @@ 11, 12, 13, - 101, 14, - 102, 15, - 103, - 16, - 104, - 105, + 102, 106, - 107, - 108, 109, 110, - 111 + 111, + 112, + 113 ], "RoomID": 0, "Codec": 0, @@ -1519,47 +1489,6 @@ } }, { - "TableId": 130, - "TableName": "VirtualGroupPool", - "TableKey": [ - 1 - ], - "ColumnsAdded": [ - { - "ColumnId": 1, - "ColumnName": "Id", - "ColumnType": "Utf8" - }, - { - "ColumnId": 2, - "ColumnName": "Generation", - "ColumnType": "Uint64" - } - ], - "ColumnsDropped": [], - "ColumnFamilies": { - "0": { - "Columns": [ - 1, - 2 - ], - "RoomID": 0, - "Codec": 0, - "InMemory": false, - "Cache": 0, - "Small": 4294967295, - "Large": 4294967295 - } - }, - "Rooms": { - "0": { - "Main": 1, - "Outer": 1, - "Blobs": 1 - } - } - }, - { "TableId": 101, "TableName": "BoxUser", "TableKey": [ |