diff options
author | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
---|---|---|
committer | monster <monster@ydb.tech> | 2022-07-07 14:41:37 +0300 |
commit | 06e5c21a835c0e923506c4ff27929f34e00761c2 (patch) | |
tree | 75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /tools/archiver/main.cpp | |
parent | 03f024c4412e3aa613bb543cf1660176320ba8f4 (diff) | |
download | ydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz |
fix ya.make
Diffstat (limited to 'tools/archiver/main.cpp')
-rw-r--r-- | tools/archiver/main.cpp | 702 |
1 files changed, 702 insertions, 0 deletions
diff --git a/tools/archiver/main.cpp b/tools/archiver/main.cpp new file mode 100644 index 0000000000..6cda54c1ea --- /dev/null +++ b/tools/archiver/main.cpp @@ -0,0 +1,702 @@ +#include <library/cpp/archive/yarchive.h> +#include <library/cpp/deprecated/mapped_file/mapped_file.h> +#include <library/cpp/digest/md5/md5.h> +#include <library/cpp/getopt/small/last_getopt.h> + +#include <util/folder/dirut.h> +#include <util/folder/filelist.h> +#include <util/folder/path.h> +#include <util/generic/vector.h> +#include <util/generic/yexception.h> +#include <util/memory/blob.h> +#include <util/stream/file.h> +#include <util/string/cast.h> +#include <util/string/escape.h> +#include <util/string/hex.h> +#include <util/string/subst.h> +#include <util/system/filemap.h> + +#include <cstring> + +namespace { + class TStringArrayOutput: public IOutputStream { + public: + TStringArrayOutput(IOutputStream* slave, size_t stride) + : Slave(*slave) + , Stride(stride) + { + Buf.reserve(stride); + } + void DoFinish() override { + WriteBuf(); + Flush(); + } + void DoWrite(const void* data, size_t len) override { + for (const char* p = (const char*)data; len > 0; ++p, --len) { + Buf.append(*p); + if (Buf.size() == Stride) + WriteBuf(); + } + } + + private: + void WriteBuf() { + Slave << '"' << Buf << "\",\n"sv; + Buf.clear(); + } + + private: + IOutputStream& Slave; + const size_t Stride; + TString Buf; + }; + + class THexOutput: public IOutputStream { + public: + inline THexOutput(IOutputStream* slave) + : Slave_(slave) + { + } + + ~THexOutput() override { + } + + inline IOutputStream* Slave() const noexcept { + return Slave_; + } + + private: + void DoFinish() override { + Slave_->Write('\n'); + Slave_->Flush(); + } + + void DoWrite(const void* data, size_t len) override { + const char* b = (const char*)data; + + while (len) { + const unsigned char c = *b; + char buf[12]; + char* tmp = buf; + + if (Count_ % Columns == 0) { + *tmp++ = ' '; + *tmp++ = ' '; + *tmp++ = ' '; + *tmp++ = ' '; + } + + if (Count_ && Count_ % Columns != 0) { + *tmp++ = ','; + *tmp++ = ' '; + } + + *tmp++ = '0'; + *tmp++ = 'x'; + tmp = HexEncode(&c, 1, tmp); + + if ((Count_ % Columns) == (Columns - 1)) { + *tmp++ = ','; + *tmp++ = '\n'; + } + + Slave_->Write(buf, tmp - buf); + + --len; + ++b; + ++Count_; + } + } + + private: + // width in source chars + static const size_t Columns = 10; + ui64 Count_ = 0; + IOutputStream* Slave_ = nullptr; + }; + + struct TYasmOutput: public IOutputStream { + inline TYasmOutput(IOutputStream* out, const TString& base) + : Out_(out) + , Base_(base) + { + *Out_ << "global " << Base_ << "\n"; + *Out_ << "global " << Base_ << "Size\n\nSECTION .rodata\n\n"; + *Out_ << Base_ << ":\n"; + } + + ~TYasmOutput() override { + } + + void DoFinish() override { + *Out_ << Base_ << "Size:\ndd " << Count_ << '\n'; + + *Out_ << "%ifidn __OUTPUT_FORMAT__,elf64\n"; + *Out_ << "size " << Base_ << " " << Count_ << "\n"; + *Out_ << "size " << Base_ << "Size 4\n"; + *Out_ << "%endif\n"; + } + + void DoWrite(const void* data, size_t len) override { + Count_ += len; + + const unsigned char* p = (const unsigned char*)data; + + while (len) { + const size_t step = Min<size_t>(len, 100); + + *Out_ << "db " << (int)*p++; + + for (size_t i = 1; i < step; ++i) { + *Out_ << ',' << (int)*p++; + } + + *Out_ << '\n'; + + len -= step; + } + } + + IOutputStream* Out_ = nullptr; + const TString Base_; + ui64 Count_ = 0; + }; + + struct TCOutput: public THexOutput { + inline TCOutput(IOutputStream* out, const TString& base) + : THexOutput(out) + , B(base) + { + *Slave() << "static_assert(sizeof(unsigned int) == 4, \"ups, unsupported platform\");\n\nextern \"C\" {\nextern const unsigned char " << B << "[] = {\n"; + } + + ~TCOutput() override { + } + + void DoFinish() override { + *Slave() << "\n};\nextern const unsigned int " << B << "Size = sizeof(" << B << ") / sizeof(" << B << "[0]);\n}\n"; + } + + const TString B; + }; + + struct TCStringOutput: public IOutputStream { + inline TCStringOutput(IOutputStream* out, const TString& base) + : O(out) + , B(base) + { + *O << "static_assert(sizeof(unsigned int) == 4, \"ups, unsupported platform\");\n\nextern \"C\" {\nextern const unsigned char " << B << "[] = \n"; + } + + ~TCStringOutput() override { + } + + void DoWrite(const void* data, size_t len) override { + *O << TString((const char*)data, len).Quote() << '\n'; + } + + void DoFinish() override { + //*O << ";\nextern const unsigned char* " << B << " = (const unsigned char*)" << B << "Array;\n"; + *O << ";\nextern const unsigned int " << B << "Size = sizeof(" << B << ") / sizeof(" << B << "[0]) - 1;\n}\n"; + } + + IOutputStream* O = nullptr; + const TString B; + }; + + struct TMyFileComparator { + bool operator()(const TString& fname1, const TString& fname2) const { + if (fname1 == fname2) { + return false; + } + if (const auto* savedResultPtr = SavedResults.FindPtr(std::make_pair(fname1, fname2))) { + return *savedResultPtr < 0; + } + TMemoryMap mmap1(fname1, TMemoryMap::oRdOnly); + TMemoryMap mmap2(fname2, TMemoryMap::oRdOnly); + mmap1.SetSequential(); + mmap2.SetSequential(); + Y_ASSERT(mmap1.Length() == mmap2.Length()); + TMemoryMap::TMapResult mapResult1 = mmap1.Map(0, mmap1.Length()); + TMemoryMap::TMapResult mapResult2 = mmap2.Map(0, mmap2.Length()); + Y_ASSERT(mapResult1.MappedSize() == mapResult2.MappedSize()); + int res = memcmp(mapResult1.MappedData(), mapResult2.MappedData(), mapResult1.MappedSize()); + mmap1.Unmap(mapResult1); + mmap2.Unmap(mapResult2); + SavedResults[std::make_pair(fname1, fname2)] = res; + SavedResults[std::make_pair(fname2, fname1)] = -res; + return res < 0; + } + + mutable THashMap<std::pair<TString, TString>, int> SavedResults; + }; + + struct TDuplicatesMap { + void Add(const TString& fname, const TString& rname) { + Y_ENSURE(!InitialFillingDone); + FileNames.push_back(fname); + FileNameToRecordName[fname] = rname; + } + + void Finish() { + Y_ENSURE(!InitialFillingDone); + InitialFillingDone = true; + TMap<i64, TVector<TString>> bySize; + for (const TString& fname: FileNames) { + TFile file(fname, OpenExisting | RdOnly); + bySize[file.GetLength()].push_back(fname); + } + for (const auto& bySizeElement: bySize) { + if (bySizeElement.second.size() > 1) { + TMap<TString, TVector<TString>, TMyFileComparator> byContents; + for (const TString& fname: bySizeElement.second) { + byContents[fname].push_back(fname); + } + for (const auto& byContentsElement: byContents) { + if (byContentsElement.second.size() > 1) { + const TString& rootName = byContentsElement.second.front(); + const TString& rootRecordName = FileNameToRecordName[rootName]; + for (const TString& fname: byContentsElement.second) { + if (fname != rootName) { + Synonyms[FileNameToRecordName[fname]] = rootRecordName; + } + } + } + } + } + } + FileNames.clear(); + FileNameToRecordName.clear(); + } + + bool InitialFillingDone = false; + TVector<TString> FileNames; + THashMap<TString, TString> FileNameToRecordName; + THashMap<TString, TString> Synonyms; + }; + + struct TDeduplicationArchiveWriter { + TDeduplicationArchiveWriter(const TDuplicatesMap& duplicatesMap, IOutputStream* out, bool compress) + : DuplicatesMap(duplicatesMap) + , Writer(out, compress) + {} + + void Finish() { + Writer.Finish(); + } + + const TDuplicatesMap& DuplicatesMap; + TArchiveWriter Writer; + }; +} + +static inline TAutoPtr<IOutputStream> OpenOutput(const TString& url) { + if (url.empty()) { + return new TBuffered<TUnbufferedFileOutput>(8192, Duplicate(1)); + } else { + return new TBuffered<TUnbufferedFileOutput>(8192, url); + } +} + +static inline bool IsDelim(char ch) noexcept { + return ch == '/' || ch == '\\'; +} + +static inline TString GetFile(const TString& s) { + const char* e = s.end(); + const char* b = s.begin(); + const char* c = e - 1; + + while (c != b && !IsDelim(*c)) { + --c; + } + + if (c != e && IsDelim(*c)) { + ++c; + } + + return TString(c, e - c); +} + +static inline TString Fix(TString f) { + if (!f.empty() && IsDelim(f[f.size() - 1])) { + f.pop_back(); + } + + return f; +} + +static bool Quiet = false; + +static inline void Append(IOutputStream& w, const TString& fname, const TString& rname) { + TMappedFileInput in(fname); + + if (!Quiet) { + Cerr << "--> " << rname << Endl; + } + + TransferData((IInputStream*)&in, &w); +} + +static inline void Append(TDuplicatesMap& w, const TString& fname, const TString& rname) { + w.Add(fname, rname); +} + +static inline void Append(TDeduplicationArchiveWriter& w, const TString& fname, const TString& rname) { + if (!Quiet) { + Cerr << "--> " << rname << Endl; + } + + if (const TString* rootRecordName = w.DuplicatesMap.Synonyms.FindPtr(rname)) { + w.Writer.AddSynonym(*rootRecordName, rname); + } else { + TMappedFileInput in(fname); + w.Writer.Add(rname, &in); + } +} + +namespace { + struct TRec { + bool Recursive = false; + TString Key; + TString Path; + TString Prefix; + + TRec() = default; + + inline void Fix() { + ::Fix(Path); + ::Fix(Prefix); + } + + template <typename T> + inline void Recurse(T& w) const { + if (IsDir(Path)) { + DoRecurse(w, "/"); + } else { + Append(w, Path, Key.size() ? Key : Prefix + "/" + GetFile(Path)); + } + } + + template <typename T> + inline void DoRecurse(T& w, const TString& off) const { + { + TFileList fl; + + const char* name; + const TString p = Path + off; + + fl.Fill(p, true); + + while ((name = fl.Next())) { + const TString fname = p + name; + const TString rname = Prefix + off + name; + + Append(w, fname, rname); + } + } + + if (Recursive) { + TDirsList dl; + + const char* name; + const TString p = Path + off; + + dl.Fill(p, true); + + while ((name = dl.Next())) { + if (strcmp(name, ".") && strcmp(name, "..")) { + DoRecurse(w, off + name + "/"); + } + } + } + } + }; +} + +static TString CutFirstSlash(const TString& fileName) { + if (fileName[0] == '/') { + return fileName.substr(1); + } else { + return fileName; + } +} + +struct TMappingReader { + TMemoryMap Map; + TBlob Blob; + TArchiveReader Reader; + + TMappingReader(const TString& archive) + : Map(archive) + , Blob(TBlob::FromMemoryMapSingleThreaded(Map, 0, Map.Length())) + , Reader(Blob) + { + } +}; + +static void UnpackArchive(const TString& archive, const TFsPath& dir = TFsPath()) { + TMappingReader mappingReader(archive); + const TArchiveReader& reader = mappingReader.Reader; + const size_t count = reader.Count(); + for (size_t i = 0; i < count; ++i) { + const TString key = reader.KeyByIndex(i); + const TString fileName = CutFirstSlash(key); + if (!Quiet) { + Cerr << archive << " --> " << fileName << Endl; + } + const TFsPath path(dir / fileName); + path.Parent().MkDirs(); + TAutoPtr<IInputStream> in = reader.ObjectByKey(key); + TFixedBufferFileOutput out(path); + TransferData(in.Get(), &out); + out.Finish(); + } +} + +static void ListArchive(const TString& archive, bool cutSlash) { + TMappingReader mappingReader(archive); + const TArchiveReader& reader = mappingReader.Reader; + const size_t count = reader.Count(); + for (size_t i = 0; i < count; ++i) { + const TString key = reader.KeyByIndex(i); + TString fileName = key; + if (cutSlash) { + fileName = CutFirstSlash(key); + } + Cout << fileName << Endl; + } +} + +static void ListArchiveMd5(const TString& archive, bool cutSlash) { + TMappingReader mappingReader(archive); + const TArchiveReader& reader = mappingReader.Reader; + const size_t count = reader.Count(); + for (size_t i = 0; i < count; ++i) { + const TString key = reader.KeyByIndex(i); + TString fileName = key; + if (cutSlash) { + fileName = CutFirstSlash(key); + } + char md5buf[33]; + Cout << fileName << '\t' << MD5::Stream(reader.ObjectByKey(key).Get(), md5buf) << Endl; + } +} + +int main(int argc, char** argv) { + NLastGetopt::TOpts opts; + opts.AddHelpOption('?'); + opts.SetTitle( + "Archiver\n" + "Docs: https://wiki.yandex-team.ru/Development/Poisk/arcadia/tools/archiver" + ); + + bool hexdump = false; + opts.AddLongOption('x', "hexdump", "Produce hexdump") + .NoArgument() + .Optional() + .StoreValue(&hexdump, true); + + size_t stride = 0; + opts.AddLongOption('s', "segments", "Produce segmented C strings array of given size") + .RequiredArgument("<size>") + .Optional() + .DefaultValue("0") + .StoreResult(&stride); + + bool cat = false; + opts.AddLongOption('c', "cat", "Do not store keys (file names), just cat uncompressed files") + .NoArgument() + .Optional() + .StoreValue(&cat, true); + + bool doNotZip = false; + opts.AddLongOption('p', "plain", "Do not use compression") + .NoArgument() + .Optional() + .StoreValue(&doNotZip, true); + + bool deduplicate = false; + opts.AddLongOption("deduplicate", "Turn on file-wise deduplication") + .NoArgument() + .Optional() + .StoreValue(&deduplicate, true); + + bool unpack = false; + opts.AddLongOption('u', "unpack", "Unpack archive into current directory") + .NoArgument() + .Optional() + .StoreValue(&unpack, true); + + bool list = false; + opts.AddLongOption('l', "list", "List files in archive") + .NoArgument() + .Optional() + .StoreValue(&list, true); + + bool cutSlash = true; + opts.AddLongOption("as-is", "somewhy slash is cutted by default in list; with this option key will be shown as-is") + .NoArgument() + .Optional() + .StoreValue(&cutSlash, false); + + bool listMd5 = false; + opts.AddLongOption('m', "md5", "List files in archive with MD5 sums") + .NoArgument() + .Optional() + .StoreValue(&listMd5, true); + + bool recursive = false; + opts.AddLongOption('r', "recursive", "Read all files under each directory, recursively") + .NoArgument() + .Optional() + .StoreValue(&recursive, true); + + Quiet = false; + opts.AddLongOption('q', "quiet", "Do not output progress to stderr") + .NoArgument() + .Optional() + .StoreValue(&Quiet, true); + + TString prepend; + opts.AddLongOption('z', "prepend", "Prepend string to output") + .RequiredArgument("<prefix>") + .StoreResult(&prepend); + + TString append; + opts.AddLongOption('a', "append", "Append string to output") + .RequiredArgument("<suffix>") + .StoreResult(&append); + + TString outputf; + opts.AddLongOption('o', "output", "Output to file instead stdout") + .RequiredArgument("<file>") + .StoreResult(&outputf); + + TString unpackDir; + opts.AddLongOption('d', "unpackdir", "Unpack destination directory") + .RequiredArgument("<dir>") + .DefaultValue(".") + .StoreResult(&unpackDir); + + TString yasmBase; + opts.AddLongOption('A', "yasm", "Output dump is yasm format") + .RequiredArgument("<base>") + .StoreResult(&yasmBase); + + TString cppBase; + opts.AddLongOption('C', "cpp", "Output dump is C/C++ format") + .RequiredArgument("<base>") + .StoreResult(&cppBase); + + TString forceKeys; + opts.AddLongOption('k', "keys", "Set explicit list of keys for elements") + .RequiredArgument("<keys>") + .StoreResult(&forceKeys); + + opts.SetFreeArgDefaultTitle("<file>"); + opts.SetFreeArgsMin(1); + NLastGetopt::TOptsParseResult optsRes(&opts, argc, argv); + + SubstGlobal(append, "\\n", "\n"); + SubstGlobal(prepend, "\\n", "\n"); + + TVector<TRec> recs; + const auto& files = optsRes.GetFreeArgs(); + + TVector<TStringBuf> keys; + if (forceKeys.size()) + StringSplitter(forceKeys).Split(':').SkipEmpty().Collect(&keys); + + if (keys.size() && keys.size() != files.size()) { + Cerr << "Invalid number of keys=" << keys.size() << " (!= number of files=" << files.size() << ")" << Endl; + return 1; + } + + for (size_t i = 0; i < files.size(); ++i) { + const auto& path = files[i]; + size_t off = 0; +#ifdef _win_ + if (path[0] > 0 && isalpha(path[0]) && path[1] == ':') + off = 2; // skip drive letter ("d:") +#endif // _win_ + const size_t pos = path.find(':', off); + TRec cur; + cur.Path = path.substr(0, pos); + if (pos != TString::npos) + cur.Prefix = path.substr(pos + 1); + if (keys.size()) + cur.Key = keys[i]; + cur.Recursive = recursive; + cur.Fix(); + recs.push_back(cur); + } + + try { + if (listMd5) { + for (const auto& rec: recs) { + ListArchiveMd5(rec.Path, cutSlash); + } + } else if (list) { + for (const auto& rec: recs) { + ListArchive(rec.Path, cutSlash); + } + } else if (unpack) { + const TFsPath dir(unpackDir); + for (const auto& rec: recs) { + UnpackArchive(rec.Path, dir); + } + } else { + TAutoPtr<IOutputStream> outf(OpenOutput(outputf)); + IOutputStream* out = outf.Get(); + THolder<IOutputStream> hexout; + + if (hexdump) { + hexout.Reset(new THexOutput(out)); + out = hexout.Get(); + } else if (stride) { + hexout.Reset(new TStringArrayOutput(out, stride)); + out = hexout.Get(); + } else if (yasmBase) { + hexout.Reset(new TYasmOutput(out, yasmBase)); + out = hexout.Get(); + } else if (cppBase) { + hexout.Reset(new TCStringOutput(out, cppBase)); + out = hexout.Get(); + } + + outf->Write(prepend.data(), prepend.size()); + + if (cat) { + for (const auto& rec: recs) { + rec.Recurse(*out); + } + } else { + TDuplicatesMap duplicatesMap; + if (deduplicate) { + for (const auto& rec: recs) { + rec.Recurse(duplicatesMap); + } + } + duplicatesMap.Finish(); + TDeduplicationArchiveWriter w(duplicatesMap, out, !doNotZip); + for (const auto& rec: recs) { + rec.Recurse(w); + } + w.Finish(); + } + + try { + out->Finish(); + } catch (...) { + } + + outf->Write(append.data(), append.size()); + } + } catch (...) { + Cerr << CurrentExceptionMessage() << Endl; + return 1; + } + + return 0; +} |