#pragma once
#include <util/generic/vector.h>
#include <util/generic/buffer.h>
#include <util/generic/hash_set.h>
#include <util/generic/cast.h>
#include <util/generic/ymath.h>
#include <util/memory/blob.h>
#include <util/stream/buffer.h>
#include <util/stream/mem.h>
#include <util/system/unaligned_mem.h>
#include <util/ysaveload.h>
#include "reader.h"
#include "writer.h"
#include <cmath>
#include <cstddef>
template <typename T>
class TYVector {
private:
ui32 Size;
const T* Data;
public:
TYVector(const TBlob& blob)
: Size(IntegerCast<ui32>(ReadUnaligned<ui64>(blob.Data())))
, Data((const T*)((const char*)blob.Data() + sizeof(ui64)))
{
}
void Get(size_t idx, T& t) const {
assert(idx < (size_t)Size);
t = ReadUnaligned<T>(Data + idx);
}
const T& At(size_t idx) const {
assert(idx < (size_t)Size);
return Data[idx];
}
size_t GetSize() const {
return Size;
}
size_t RealSize() const {
return sizeof(ui64) + Size * sizeof(T);
}
~TYVector() = default;
};
template <typename T>
class TYVectorWriter {
private:
TVector<T> Vector;
public:
TYVectorWriter() = default;
void PushBack(const T& value) {
Vector.push_back(value);
}
void Save(IOutputStream& out) const {
ui64 uSize = (ui64)Vector.size();
out.Write(&uSize, sizeof(uSize));
out.Write(Vector.data(), Vector.size() * sizeof(T));
}
const T& At(size_t idx) const {
assert(idx < Size());
return Vector[idx];
}
T& At(size_t idx) {
assert(idx < Size());
return Vector[idx];
}
void Clear() {
Vector.clear();
}
size_t Size() const {
return Vector.size();
}
void Resize(size_t size) {
Vector.resize(size);
}
void Resize(size_t size, const T& value) {
Vector.resize(size, value);
}
};
template <typename T, bool>
struct TYVectorG;
template <typename X>
struct TYVectorG<X, false> {
typedef TYVector<X> T;
};
template <typename X>
struct TYVectorG<X, true> {
typedef TYVectorWriter<X> T;
};
template <typename T>
struct TIsMemsetThisWithZeroesSupported {
enum {
Result = TTypeTraits<T>::IsPod
};
};
#define MEMSET_THIS_WITH_ZEROES_SUPPORTED(type) \
template <> \
struct TIsMemsetThisWithZeroesSupported<type> { \
enum { \
Result = true \
}; \
};
class TPlainHashCommon {
protected:
#pragma pack(push, 8)
template <typename TKey, typename TValue>
class TPackedPair {
private:
typedef TPackedPair<TKey, TValue> TThis;
TKey Key;
TValue Value;
private:
static_assert(TIsMemsetThisWithZeroesSupported<TKey>::Result, "expect TIsMemsetThisWithZeroesSupported<TKey>::Result");
static_assert(TIsMemsetThisWithZeroesSupported<TValue>::Result, "expect TIsMemsetThisWithZeroesSupported<TValue>::Result");
/// to aviod uninitialized bytes
void Init(const TKey& key, const TValue& value) {
memset(static_cast<TThis*>(this), 0, sizeof(TThis));
Key = key;
Value = value;
}
public:
TPackedPair(typename TTypeTraits<TKey>::TFuncParam key, typename TTypeTraits<TValue>::TFuncParam value) {
Init(key, value);
}
TPackedPair(const TThis& rhs) {
Init(rhs.Key, rhs.Value);
}
TPackedPair& operator=(const TThis& rhs) {
if (this != &rhs) {
Init(rhs.Key, rhs.Value);
}
return *this;
}
TPackedPair() {
Init(TKey(), TValue());
}
typename TTypeTraits<TKey>::TFuncParam First() const {
return Key;
}
typename TTypeTraits<TValue>::TFuncParam Second() const {
return Value;
}
static TKey GetFirst(const void* self) {
static constexpr size_t offset = offsetof(TThis, Key);
return ReadUnaligned<TKey>(reinterpret_cast<const char*>(self) + offset);
}
static TValue GetSecond(const void* self) {
static constexpr size_t offset = offsetof(TThis, Value);
return ReadUnaligned<TValue>(reinterpret_cast<const char*>(self) + offset);
}
};
#pragma pack(pop)
protected:
static const ui16 VERSION_ID = 2;
#pragma pack(push, 8)
struct TInterval {
static const ui32 INVALID = (ui32)-1;
ui32 Offset;
ui32 Length;
TInterval()
: Offset(INVALID)
, Length(INVALID)
{
}
TInterval(ui32 offset, ui32 length)
: Offset(offset)
, Length(length)
{
}
static inline ui32 GetOffset(const TInterval* self) {
static constexpr size_t offset = offsetof(TInterval, Offset);
return ReadUnaligned<ui32>(reinterpret_cast<const char*>(self) + offset);
}
static inline ui32 GetLength(const TInterval* self) {
static constexpr size_t offset = offsetof(TInterval, Length);
return ReadUnaligned<ui32>(reinterpret_cast<const char*>(self) + offset);
}
};
#pragma pack(pop)
static_assert(8 == sizeof(TInterval), "expect 8 == sizeof(TInterval)");
template <typename TKey>
static ui32 KeyHash(typename TTypeTraits<TKey>::TFuncParam key, ui16 bits) {
Y_ASSERT(bits < 32);
const ui32 res = ui32(key) & ((ui32(1) << bits) - 1);
Y_ASSERT(res < (ui32(1) << bits));
return res;
}
};
template <typename TKey, typename TValue>
class TPlainHashWriter : TPlainHashCommon {
private:
typedef TPackedPair<TKey, TValue> TKeyValuePair;
typedef TVector<TKeyValuePair> TData;
TData Data;
typedef TVector<TData> TData2;
bool IsPlainEnought(ui16 bits) const {
TVector<size_t> counts(1LL << bits, 0);
for (size_t i = 0; i < Data.size(); ++i) {
size_t& count = counts[KeyHash<TKey>(TKeyValuePair::GetFirst(&Data[i]), bits)];
++count;
if (count > 2)
return false;
}
return true;
}
public:
void Add(const TKey& key, const TValue& value) {
Data.push_back(TKeyValuePair(key, value));
}
void Save(IOutputStream& out) const {
Y_ASSERT(Data.size() < Max<ui32>());
WriteBin<ui16>(&out, VERSION_ID);
static const ui32 PAIR_SIZE = sizeof(TKeyValuePair);
WriteBin<ui32>(&out, PAIR_SIZE);
ui16 bits;
if (!Data.empty()) {
bits = (ui16)(log((float)Data.size()) / log(2.f));
while ((bits < 22) && !IsPlainEnought(bits))
++bits;
} else {
bits = 0;
}
WriteBin<ui16>(&out, bits);
WriteBin<ui32>(&out, (ui32)Data.size());
const ui32 nBuckets = ui32(1) << bits;
TData2 data2(nBuckets);
for (size_t i = 0; i < Data.size(); ++i)
data2[KeyHash<TKey>(TKeyValuePair::GetFirst(&Data[i]), bits)].push_back(Data[i]);
typedef TVector<TInterval> TIntervals;
TIntervals intervals(nBuckets);
ui32 offset = 0;
for (ui32 i = 0; i < nBuckets; ++i) {
intervals[i].Offset = offset;
intervals[i].Length = (ui32)data2[i].size();
offset += (ui32)data2[i].size();
}
#ifndef NDEBUG
for (ui32 i = 0; i < nBuckets; ++i) {
for (size_t j = 0; j < data2[i].size(); ++j)
for (size_t k = j + 1; k < data2[i].size(); ++k)
if (TKeyValuePair::GetFirst(&data2[i][j]) == TKeyValuePair::GetFirst(&data2[i][k]))
ythrow yexception() << "key clash";
}
#endif
out.Write(intervals.data(), intervals.size() * sizeof(intervals[0]));
for (ui32 i = 0; i < nBuckets; ++i)
out.Write(data2[i].data(), data2[i].size() * sizeof(data2[i][0]));
}
};
template <typename TKey, typename TValue>
class TPlainHash : TPlainHashCommon {
private:
typedef TPackedPair<TKey, TValue> TKeyValuePair;
const char* P;
ui16 GetBits() const {
return ReadUnaligned<ui16>(P + 6);
}
ui32 GetSize() const {
return ReadUnaligned<ui32>(P + 8);
}
const TInterval* GetIntervals() const {
return (const TInterval*)(P + 12);
}
const TKeyValuePair* GetData() const {
return (const TKeyValuePair*)(GetIntervals() + (1ULL << GetBits()));
}
template <typename T>
void Init(const T* p) {
static_assert(sizeof(T) == 1, "expect sizeof(T) == 1");
P = reinterpret_cast<const char*>(p);
#ifndef NDEBUG
ui16 version = ReadUnaligned<ui16>(p);
if (version != VERSION_ID)
ythrow yexception() << "bad version: " << version;
static const ui32 PAIR_SIZE = sizeof(TKeyValuePair);
const ui32 size = ReadUnaligned<ui32>(p + 2);
if (size != PAIR_SIZE)
ythrow yexception() << "bad size " << size << " instead of " << PAIR_SIZE;
#endif
}
public:
typedef const TKeyValuePair* TConstIterator;
TPlainHash(const char* p) {
Init(p);
}
TPlainHash(const TBlob& blob) {
Init(blob.Begin());
}
bool Find(typename TTypeTraits<TKey>::TFuncParam key, TValue* res) const {
// Cerr << GetBits() << "\t" << (1 << GetBits()) << "\t" << GetSize() << Endl;
const ui32 hash = KeyHash<TKey>(key, GetBits());
const TInterval* intervalPtr = GetIntervals();
const TKeyValuePair* pair = GetData() + TInterval::GetOffset(intervalPtr + hash);
const ui32 length = TInterval::GetLength(intervalPtr + hash);
for (ui32 i = 0; i < length; ++i, ++pair) {
if (TKeyValuePair::GetFirst(pair) == key) {
*res = TKeyValuePair::GetSecond(pair);
return true;
}
}
return false;
}
TValue Get(typename TTypeTraits<TKey>::TFuncParam key) const {
TValue res;
if (Find(key, &res))
return res;
else
ythrow yexception() << "key not found";
}
TConstIterator Begin() const {
return GetData();
}
TConstIterator End() const {
return GetData() + GetSize();
}
const char* ByteEnd() const {
return (const char*)(GetData() + GetSize());
}
size_t ByteSize() const {
return 12 + sizeof(TInterval) * (size_t(1) << GetBits()) + sizeof(TKeyValuePair) * GetSize();
}
};
template <typename Key, typename Value, bool>
struct TPlainHashG;
template <typename Key, typename Value>
struct TPlainHashG<Key, Value, false> {
typedef TPlainHash<Key, Value> T;
};
template <typename Key, typename Value>
struct TPlainHashG<Key, Value, true> {
typedef TPlainHashWriter<Key, Value> T;
};
template <typename T>
class TSingleValue {
private:
const T* Value;
public:
TSingleValue(const TBlob& blob) {
Y_ASSERT(blob.Length() >= sizeof(T));
Y_ASSERT(blob.Length() <= sizeof(T) + 16);
Value = reinterpret_cast<const T*>(blob.Begin());
}
const T& Get() const {
return *Value;
}
};
template <typename T>
class TSingleValueWriter {
private:
T Value;
public:
TSingleValueWriter() = default;
TSingleValueWriter(const T& value)
: Value(value)
{
}
void Set(const T& value) {
Value = value;
}
void Save(IOutputStream& out) const {
out.Write(&Value, sizeof(Value));
}
};
TBlob GetBlock(const TBlob& data, size_t index);
template <class T>
void WriteBlock(TChunkedDataWriter& writer, const T& t) {
writer.NewBlock();
t.Save(writer);
}
template <class T>
void WriteBlock(TChunkedDataWriter& writer, T& t) {
writer.NewBlock();
t.Save(writer);
}
// Extends TChunkedDataWriter, allowing user to name blocks with arbitrary strings.
class TNamedChunkedDataWriter: public TChunkedDataWriter {
public:
TNamedChunkedDataWriter(IOutputStream& slave);
~TNamedChunkedDataWriter() override;
// Start a new unnamed block, overrides TChunkedDataReader::NewBlock().
void NewBlock();
// Start a new block with given name (possibly empty, in which case block is unnamed).
// Throws an exception if name is a duplicate.
void NewBlock(const TString& name);
void WriteFooter();
private:
TVector<TString> Names;
THashMap<TString, size_t> NameToIndex;
};
class TNamedChunkedDataReader: public TChunkedDataReader {
public:
TNamedChunkedDataReader(const TBlob& blob);
inline bool HasBlock(const char* name) const {
return NameToIndex.find(name) != NameToIndex.end();
}
inline size_t GetIndexByName(const char* name) const {
THashMap<TString, size_t>::const_iterator it = NameToIndex.find(name);
if (it == NameToIndex.end())
throw yexception() << "Block \"" << name << "\" is not found";
else
return it->second;
}
// Returns number of blocks written to the file by user of TNamedChunkedDataReader.
inline size_t GetBlocksCount() const {
// Last block is for internal usage
return TChunkedDataReader::GetBlocksCount() - 1;
}
inline const char* GetBlockName(size_t index) const {
Y_ASSERT(index < GetBlocksCount());
return Names[index].data();
}
inline const void* GetBlockByName(const char* name) const {
return GetBlock(GetIndexByName(name));
}
inline size_t GetBlockLenByName(const char* name) const {
return GetBlockLen(GetIndexByName(name));
}
inline TBlob GetBlobByName(const char* name) const {
size_t id = GetIndexByName(name);
return TBlob::NoCopy(GetBlock(id), GetBlockLen(id));
}
inline bool GetBlobByName(const char* name, TBlob& blob) const {
THashMap<TString, size_t>::const_iterator it = NameToIndex.find(name);
if (it == NameToIndex.end())
return false;
blob = TBlob::NoCopy(GetBlock(it->second), GetBlockLen(it->second));
return true;
}
private:
TVector<TString> Names;
THashMap<TString, size_t> NameToIndex;
};
template <class T>
struct TSaveLoadVectorNonPodElement {
static inline void Save(IOutputStream* out, const T& t) {
TSerializer<T>::Save(out, t);
}
static inline void Load(IInputStream* in, T& t, size_t elementSize) {
Y_ASSERT(elementSize > 0);
TSerializer<T>::Load(in, t);
}
};
template <class T, bool isPod>
class TVectorTakingIntoAccountThePodType {
private:
ui64 SizeofOffsets;
const ui64* Offsets;
const char* Data;
public:
TVectorTakingIntoAccountThePodType(const TBlob& blob) {
SizeofOffsets = ReadUnaligned<ui64>(blob.Begin());
Y_ASSERT(SizeofOffsets > 0);
Offsets = reinterpret_cast<const ui64*>(blob.Begin() + sizeof(ui64));
Data = reinterpret_cast<const char*>(blob.Begin() + sizeof(ui64) + SizeofOffsets * sizeof(ui64));
}
size_t GetSize() const {
return (size_t)(SizeofOffsets - 1);
}
size_t GetLength(ui64 index) const {
if (index + 1 >= SizeofOffsets)
ythrow yexception() << "bad offset";
return IntegerCast<size_t>(ReadUnaligned<ui64>(Offsets + index + 1) - ReadUnaligned<ui64>(Offsets + index));
}
void Get(ui64 index, T& t) const {
const size_t len = GetLength(index);
TMemoryInput input(Data + ReadUnaligned<ui64>(Offsets + index), len);
TSaveLoadVectorNonPodElement<T>::Load(&input, t, len);
}
T Get(ui64 index) const {
T ret;
Get(index, ret);
return ret;
}
size_t RealSize() const {
return sizeof(ui64) * (SizeofOffsets + 1) + ReadUnaligned<ui64>(Offsets + SizeofOffsets - 1);
}
};
template <class T, bool isPod>
class TVectorTakingIntoAccountThePodTypeWriter : TNonCopyable {
private:
typedef TVector<ui64> TOffsets;
TOffsets Offsets;
TBuffer Data;
TBufferOutput DataStream;
public:
TVectorTakingIntoAccountThePodTypeWriter()
: DataStream(Data)
{
}
void PushBack(const T& t) {
Offsets.push_back((ui64) Data.size());
TSaveLoadVectorNonPodElement<T>::Save(&DataStream, t);
}
size_t Size() const {
return Offsets.size();
}
void Save(IOutputStream& out) const {
ui64 sizeofOffsets = Offsets.size() + 1;
out.Write(&sizeofOffsets, sizeof(sizeofOffsets));
out.Write(Offsets.data(), Offsets.size() * sizeof(Offsets[0]));
ui64 lastOffset = (ui64) Data.size();
out.Write(&lastOffset, sizeof(lastOffset));
out.Write(Data.data(), Data.size());
}
};
template <class T>
class TVectorTakingIntoAccountThePodType<T, true>: public TYVector<T> {
public:
TVectorTakingIntoAccountThePodType(const TBlob& blob)
: TYVector<T>(blob)
{
}
};
template <class T>
class TVectorTakingIntoAccountThePodTypeWriter<T, true>: public TYVectorWriter<T> {
};
template <typename T>
class TGeneralVector: public TVectorTakingIntoAccountThePodType<T, TTypeTraits<T>::IsPod> {
typedef TVectorTakingIntoAccountThePodType<T, TTypeTraits<T>::IsPod> TBase;
public:
TGeneralVector(const TBlob& blob)
: TBase(blob)
{
}
};
template <typename T>
class TGeneralVectorWriter: public TVectorTakingIntoAccountThePodTypeWriter<T, TTypeTraits<T>::IsPod> {
};
template <typename TItem, bool>
struct TGeneralVectorG;
template <typename TItem>
struct TGeneralVectorG<TItem, false> {
typedef TGeneralVector<TItem> T;
};
template <typename TItem>
struct TGeneralVectorG<TItem, true> {
typedef TGeneralVectorWriter<TItem> T;
};
template <>
struct TSaveLoadVectorNonPodElement<TString> {
static inline void Save(IOutputStream* out, const TString& s) {
out->Write(s.data(), s.size() + 1);
}
static inline void Load(TMemoryInput* in, TString& s, size_t elementSize) {
Y_ASSERT(elementSize > 0 && in->Avail() >= elementSize);
s.assign(in->Buf(), elementSize - 1); /// excluding 0 at the end
}
};
template <bool G>
struct TStringsVectorG: public TGeneralVectorG<TString, G> {
};
using TStringsVector = TGeneralVector<TString>;
using TStringsVectorWriter = TGeneralVectorWriter<TString>;