summary refs log tree commit diff stats
diff options
context:
space:
mode:
author zverevgeny <[email protected]> 2025-06-02 03:46:08 +0300
committer GitHub <[email protected]> 2025-06-02 00:46:08 +0000
commit 88225b68cc277a33ee6217f1de8755a2ed760aaf (patch)
tree 1ccb580609323055b96577284e92d096ebc9f752
parent 08f5f3c117321bfbc38ea44db681b97784fbea38 (diff)
Fix serialized batch stat calculation (#19106)
-rw-r--r-- ydb/library/formats/arrow/splitter/stats.cpp 4
-rw-r--r-- ydb/library/formats/arrow/splitter/stats.h 41
2 files changed, 28 insertions, 17 deletions
diff --git a/ydb/library/formats/arrow/splitter/stats.cpp b/ydb/library/formats/arrow/splitter/stats.cpp
index 5235cdccb44..bd82a70b2b4 100644
--- a/ydb/library/formats/arrow/splitter/stats.cpp
+++ b/ydb/library/formats/arrow/splitter/stats.cpp
@@ -45,10 +45,10 @@ std::vector<i64> TSimpleSerializationStat::SplitRecords(
}
std::vector<i64> TBatchSerializationStat::SplitRecordsForBlobSize(const i64 recordsCount, const ui64 blobSize) const {
- if (!SerializedBytesPerRecord) {
+ if (!SerializedBytes) {
return { recordsCount };
}
- const ui32 recordsCountPerBlob = blobSize / SerializedBytesPerRecord;
+ const ui32 recordsCountPerBlob = blobSize / GetSerializedBytesPerRecord();
return TSimilarPacker::SplitWithExpected(recordsCount, recordsCountPerBlob);
}
diff --git a/ydb/library/formats/arrow/splitter/stats.h b/ydb/library/formats/arrow/splitter/stats.h
index 2de398bf452..cacd1fbb422 100644
--- a/ydb/library/formats/arrow/splitter/stats.h
+++ b/ydb/library/formats/arrow/splitter/stats.h
@@ -69,49 +69,60 @@ public:
class TBatchSerializationStat {
protected:
- double SerializedBytesPerRecord = 0;
- double RawBytesPerRecord = 0;
+ ui64 RecordCount = 0;
+ double SerializedBytes = 0;
+ double RawBytes = 0;
+protected:
+ double GetSerializedBytesPerRecord() const {
+ return SerializedBytes / RecordCount;
+ }
+ double GetRawBytesPerRecord() const {
+ return RawBytes / RecordCount;
+ }
public:
TBatchSerializationStat() = default;
TBatchSerializationStat(const ui64 bytes, const ui64 recordsCount, const ui64 rawBytes) {
Y_ABORT_UNLESS(recordsCount);
- SerializedBytesPerRecord = 1.0 * bytes / recordsCount;
- RawBytesPerRecord = 1.0 * rawBytes / recordsCount;
+ RecordCount = recordsCount;
+ SerializedBytes = bytes;
+ RawBytes = rawBytes;
}
TString DebugString() const {
- return TStringBuilder() << "{sbpr=" << SerializedBytesPerRecord << ";rbpr=" << RawBytesPerRecord << "}";
+ return TStringBuilder() << "{sbpr=" << GetSerializedBytesPerRecord() << ";rbpr=" << GetRawBytesPerRecord() << "}";
}
TBatchSerializationStat(const TSimpleSerializationStat& simple) {
- SerializedBytesPerRecord = simple.GetSerializedBytesPerRecord();
- RawBytesPerRecord = simple.GetRawBytesPerRecord();
+ RecordCount = simple.GetRecordsCount();
+ SerializedBytes = simple.GetSerializedBytes();
+ RawBytes = simple.GetRawBytes();
}
void Merge(const TSimpleSerializationStat& item) {
- SerializedBytesPerRecord += item.GetSerializedBytesPerRecord();
- RawBytesPerRecord += item.GetRawBytesPerRecord();
+ RecordCount += item.GetRecordsCount();
+ SerializedBytes += item.GetSerializedBytes();
+ RawBytes += item.GetRawBytes();
}
std::vector<i64> SplitRecordsForBlobSize(const i64 recordsCount, const ui64 blobSize) const;
- std::optional<ui64> PredictOptimalPackRecordsCount(const ui64 recordsCount, const ui64 blobSize) const {
- if (!SerializedBytesPerRecord) {
+ std::optional<ui64> PredictOptimalPackRecordsCount(const ui64 recordsCount, const ui64 blobSize) const {
+ if (!SerializedBytes) {
return {};
}
- const ui64 fullSize = 1.0 * recordsCount * SerializedBytesPerRecord;
+ const ui64 fullSize = recordsCount * GetSerializedBytesPerRecord();
if (fullSize < blobSize) {
return recordsCount;
} else {
- return std::floor(1.0 * blobSize / SerializedBytesPerRecord);
+ return std::floor(blobSize / GetSerializedBytesPerRecord());
}
}
std::optional<ui64> PredictOptimalSplitFactor(const ui64 recordsCount, const ui64 blobSize) const {
- if (!SerializedBytesPerRecord) {
+ if (!SerializedBytes) {
return {};
}
- const ui64 fullSize = 1.0 * recordsCount * SerializedBytesPerRecord;
+ const ui64 fullSize = recordsCount * GetSerializedBytesPerRecord();
if (fullSize < blobSize) {
return 1;
} else {