aboutsummaryrefslogtreecommitdiffstats
path: root/tools/archiver/main.cpp
diff options
context:
space:
mode:
authormonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
committermonster <monster@ydb.tech>2022-07-07 14:41:37 +0300
commit06e5c21a835c0e923506c4ff27929f34e00761c2 (patch)
tree75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /tools/archiver/main.cpp
parent03f024c4412e3aa613bb543cf1660176320ba8f4 (diff)
downloadydb-06e5c21a835c0e923506c4ff27929f34e00761c2.tar.gz
fix ya.make
Diffstat (limited to 'tools/archiver/main.cpp')
-rw-r--r--tools/archiver/main.cpp702
1 files changed, 702 insertions, 0 deletions
diff --git a/tools/archiver/main.cpp b/tools/archiver/main.cpp
new file mode 100644
index 0000000000..6cda54c1ea
--- /dev/null
+++ b/tools/archiver/main.cpp
@@ -0,0 +1,702 @@
+#include <library/cpp/archive/yarchive.h>
+#include <library/cpp/deprecated/mapped_file/mapped_file.h>
+#include <library/cpp/digest/md5/md5.h>
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/folder/dirut.h>
+#include <util/folder/filelist.h>
+#include <util/folder/path.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+#include <util/memory/blob.h>
+#include <util/stream/file.h>
+#include <util/string/cast.h>
+#include <util/string/escape.h>
+#include <util/string/hex.h>
+#include <util/string/subst.h>
+#include <util/system/filemap.h>
+
+#include <cstring>
+
+namespace {
+ class TStringArrayOutput: public IOutputStream {
+ public:
+ TStringArrayOutput(IOutputStream* slave, size_t stride)
+ : Slave(*slave)
+ , Stride(stride)
+ {
+ Buf.reserve(stride);
+ }
+ void DoFinish() override {
+ WriteBuf();
+ Flush();
+ }
+ void DoWrite(const void* data, size_t len) override {
+ for (const char* p = (const char*)data; len > 0; ++p, --len) {
+ Buf.append(*p);
+ if (Buf.size() == Stride)
+ WriteBuf();
+ }
+ }
+
+ private:
+ void WriteBuf() {
+ Slave << '"' << Buf << "\",\n"sv;
+ Buf.clear();
+ }
+
+ private:
+ IOutputStream& Slave;
+ const size_t Stride;
+ TString Buf;
+ };
+
+ class THexOutput: public IOutputStream {
+ public:
+ inline THexOutput(IOutputStream* slave)
+ : Slave_(slave)
+ {
+ }
+
+ ~THexOutput() override {
+ }
+
+ inline IOutputStream* Slave() const noexcept {
+ return Slave_;
+ }
+
+ private:
+ void DoFinish() override {
+ Slave_->Write('\n');
+ Slave_->Flush();
+ }
+
+ void DoWrite(const void* data, size_t len) override {
+ const char* b = (const char*)data;
+
+ while (len) {
+ const unsigned char c = *b;
+ char buf[12];
+ char* tmp = buf;
+
+ if (Count_ % Columns == 0) {
+ *tmp++ = ' ';
+ *tmp++ = ' ';
+ *tmp++ = ' ';
+ *tmp++ = ' ';
+ }
+
+ if (Count_ && Count_ % Columns != 0) {
+ *tmp++ = ',';
+ *tmp++ = ' ';
+ }
+
+ *tmp++ = '0';
+ *tmp++ = 'x';
+ tmp = HexEncode(&c, 1, tmp);
+
+ if ((Count_ % Columns) == (Columns - 1)) {
+ *tmp++ = ',';
+ *tmp++ = '\n';
+ }
+
+ Slave_->Write(buf, tmp - buf);
+
+ --len;
+ ++b;
+ ++Count_;
+ }
+ }
+
+ private:
+ // width in source chars
+ static const size_t Columns = 10;
+ ui64 Count_ = 0;
+ IOutputStream* Slave_ = nullptr;
+ };
+
+ struct TYasmOutput: public IOutputStream {
+ inline TYasmOutput(IOutputStream* out, const TString& base)
+ : Out_(out)
+ , Base_(base)
+ {
+ *Out_ << "global " << Base_ << "\n";
+ *Out_ << "global " << Base_ << "Size\n\nSECTION .rodata\n\n";
+ *Out_ << Base_ << ":\n";
+ }
+
+ ~TYasmOutput() override {
+ }
+
+ void DoFinish() override {
+ *Out_ << Base_ << "Size:\ndd " << Count_ << '\n';
+
+ *Out_ << "%ifidn __OUTPUT_FORMAT__,elf64\n";
+ *Out_ << "size " << Base_ << " " << Count_ << "\n";
+ *Out_ << "size " << Base_ << "Size 4\n";
+ *Out_ << "%endif\n";
+ }
+
+ void DoWrite(const void* data, size_t len) override {
+ Count_ += len;
+
+ const unsigned char* p = (const unsigned char*)data;
+
+ while (len) {
+ const size_t step = Min<size_t>(len, 100);
+
+ *Out_ << "db " << (int)*p++;
+
+ for (size_t i = 1; i < step; ++i) {
+ *Out_ << ',' << (int)*p++;
+ }
+
+ *Out_ << '\n';
+
+ len -= step;
+ }
+ }
+
+ IOutputStream* Out_ = nullptr;
+ const TString Base_;
+ ui64 Count_ = 0;
+ };
+
+ struct TCOutput: public THexOutput {
+ inline TCOutput(IOutputStream* out, const TString& base)
+ : THexOutput(out)
+ , B(base)
+ {
+ *Slave() << "static_assert(sizeof(unsigned int) == 4, \"ups, unsupported platform\");\n\nextern \"C\" {\nextern const unsigned char " << B << "[] = {\n";
+ }
+
+ ~TCOutput() override {
+ }
+
+ void DoFinish() override {
+ *Slave() << "\n};\nextern const unsigned int " << B << "Size = sizeof(" << B << ") / sizeof(" << B << "[0]);\n}\n";
+ }
+
+ const TString B;
+ };
+
+ struct TCStringOutput: public IOutputStream {
+ inline TCStringOutput(IOutputStream* out, const TString& base)
+ : O(out)
+ , B(base)
+ {
+ *O << "static_assert(sizeof(unsigned int) == 4, \"ups, unsupported platform\");\n\nextern \"C\" {\nextern const unsigned char " << B << "[] = \n";
+ }
+
+ ~TCStringOutput() override {
+ }
+
+ void DoWrite(const void* data, size_t len) override {
+ *O << TString((const char*)data, len).Quote() << '\n';
+ }
+
+ void DoFinish() override {
+ //*O << ";\nextern const unsigned char* " << B << " = (const unsigned char*)" << B << "Array;\n";
+ *O << ";\nextern const unsigned int " << B << "Size = sizeof(" << B << ") / sizeof(" << B << "[0]) - 1;\n}\n";
+ }
+
+ IOutputStream* O = nullptr;
+ const TString B;
+ };
+
+ struct TMyFileComparator {
+ bool operator()(const TString& fname1, const TString& fname2) const {
+ if (fname1 == fname2) {
+ return false;
+ }
+ if (const auto* savedResultPtr = SavedResults.FindPtr(std::make_pair(fname1, fname2))) {
+ return *savedResultPtr < 0;
+ }
+ TMemoryMap mmap1(fname1, TMemoryMap::oRdOnly);
+ TMemoryMap mmap2(fname2, TMemoryMap::oRdOnly);
+ mmap1.SetSequential();
+ mmap2.SetSequential();
+ Y_ASSERT(mmap1.Length() == mmap2.Length());
+ TMemoryMap::TMapResult mapResult1 = mmap1.Map(0, mmap1.Length());
+ TMemoryMap::TMapResult mapResult2 = mmap2.Map(0, mmap2.Length());
+ Y_ASSERT(mapResult1.MappedSize() == mapResult2.MappedSize());
+ int res = memcmp(mapResult1.MappedData(), mapResult2.MappedData(), mapResult1.MappedSize());
+ mmap1.Unmap(mapResult1);
+ mmap2.Unmap(mapResult2);
+ SavedResults[std::make_pair(fname1, fname2)] = res;
+ SavedResults[std::make_pair(fname2, fname1)] = -res;
+ return res < 0;
+ }
+
+ mutable THashMap<std::pair<TString, TString>, int> SavedResults;
+ };
+
+ struct TDuplicatesMap {
+ void Add(const TString& fname, const TString& rname) {
+ Y_ENSURE(!InitialFillingDone);
+ FileNames.push_back(fname);
+ FileNameToRecordName[fname] = rname;
+ }
+
+ void Finish() {
+ Y_ENSURE(!InitialFillingDone);
+ InitialFillingDone = true;
+ TMap<i64, TVector<TString>> bySize;
+ for (const TString& fname: FileNames) {
+ TFile file(fname, OpenExisting | RdOnly);
+ bySize[file.GetLength()].push_back(fname);
+ }
+ for (const auto& bySizeElement: bySize) {
+ if (bySizeElement.second.size() > 1) {
+ TMap<TString, TVector<TString>, TMyFileComparator> byContents;
+ for (const TString& fname: bySizeElement.second) {
+ byContents[fname].push_back(fname);
+ }
+ for (const auto& byContentsElement: byContents) {
+ if (byContentsElement.second.size() > 1) {
+ const TString& rootName = byContentsElement.second.front();
+ const TString& rootRecordName = FileNameToRecordName[rootName];
+ for (const TString& fname: byContentsElement.second) {
+ if (fname != rootName) {
+ Synonyms[FileNameToRecordName[fname]] = rootRecordName;
+ }
+ }
+ }
+ }
+ }
+ }
+ FileNames.clear();
+ FileNameToRecordName.clear();
+ }
+
+ bool InitialFillingDone = false;
+ TVector<TString> FileNames;
+ THashMap<TString, TString> FileNameToRecordName;
+ THashMap<TString, TString> Synonyms;
+ };
+
+ struct TDeduplicationArchiveWriter {
+ TDeduplicationArchiveWriter(const TDuplicatesMap& duplicatesMap, IOutputStream* out, bool compress)
+ : DuplicatesMap(duplicatesMap)
+ , Writer(out, compress)
+ {}
+
+ void Finish() {
+ Writer.Finish();
+ }
+
+ const TDuplicatesMap& DuplicatesMap;
+ TArchiveWriter Writer;
+ };
+}
+
+static inline TAutoPtr<IOutputStream> OpenOutput(const TString& url) {
+ if (url.empty()) {
+ return new TBuffered<TUnbufferedFileOutput>(8192, Duplicate(1));
+ } else {
+ return new TBuffered<TUnbufferedFileOutput>(8192, url);
+ }
+}
+
+static inline bool IsDelim(char ch) noexcept {
+ return ch == '/' || ch == '\\';
+}
+
+static inline TString GetFile(const TString& s) {
+ const char* e = s.end();
+ const char* b = s.begin();
+ const char* c = e - 1;
+
+ while (c != b && !IsDelim(*c)) {
+ --c;
+ }
+
+ if (c != e && IsDelim(*c)) {
+ ++c;
+ }
+
+ return TString(c, e - c);
+}
+
+static inline TString Fix(TString f) {
+ if (!f.empty() && IsDelim(f[f.size() - 1])) {
+ f.pop_back();
+ }
+
+ return f;
+}
+
+static bool Quiet = false;
+
+static inline void Append(IOutputStream& w, const TString& fname, const TString& rname) {
+ TMappedFileInput in(fname);
+
+ if (!Quiet) {
+ Cerr << "--> " << rname << Endl;
+ }
+
+ TransferData((IInputStream*)&in, &w);
+}
+
+static inline void Append(TDuplicatesMap& w, const TString& fname, const TString& rname) {
+ w.Add(fname, rname);
+}
+
+static inline void Append(TDeduplicationArchiveWriter& w, const TString& fname, const TString& rname) {
+ if (!Quiet) {
+ Cerr << "--> " << rname << Endl;
+ }
+
+ if (const TString* rootRecordName = w.DuplicatesMap.Synonyms.FindPtr(rname)) {
+ w.Writer.AddSynonym(*rootRecordName, rname);
+ } else {
+ TMappedFileInput in(fname);
+ w.Writer.Add(rname, &in);
+ }
+}
+
+namespace {
+ struct TRec {
+ bool Recursive = false;
+ TString Key;
+ TString Path;
+ TString Prefix;
+
+ TRec() = default;
+
+ inline void Fix() {
+ ::Fix(Path);
+ ::Fix(Prefix);
+ }
+
+ template <typename T>
+ inline void Recurse(T& w) const {
+ if (IsDir(Path)) {
+ DoRecurse(w, "/");
+ } else {
+ Append(w, Path, Key.size() ? Key : Prefix + "/" + GetFile(Path));
+ }
+ }
+
+ template <typename T>
+ inline void DoRecurse(T& w, const TString& off) const {
+ {
+ TFileList fl;
+
+ const char* name;
+ const TString p = Path + off;
+
+ fl.Fill(p, true);
+
+ while ((name = fl.Next())) {
+ const TString fname = p + name;
+ const TString rname = Prefix + off + name;
+
+ Append(w, fname, rname);
+ }
+ }
+
+ if (Recursive) {
+ TDirsList dl;
+
+ const char* name;
+ const TString p = Path + off;
+
+ dl.Fill(p, true);
+
+ while ((name = dl.Next())) {
+ if (strcmp(name, ".") && strcmp(name, "..")) {
+ DoRecurse(w, off + name + "/");
+ }
+ }
+ }
+ }
+ };
+}
+
+static TString CutFirstSlash(const TString& fileName) {
+ if (fileName[0] == '/') {
+ return fileName.substr(1);
+ } else {
+ return fileName;
+ }
+}
+
+struct TMappingReader {
+ TMemoryMap Map;
+ TBlob Blob;
+ TArchiveReader Reader;
+
+ TMappingReader(const TString& archive)
+ : Map(archive)
+ , Blob(TBlob::FromMemoryMapSingleThreaded(Map, 0, Map.Length()))
+ , Reader(Blob)
+ {
+ }
+};
+
+static void UnpackArchive(const TString& archive, const TFsPath& dir = TFsPath()) {
+ TMappingReader mappingReader(archive);
+ const TArchiveReader& reader = mappingReader.Reader;
+ const size_t count = reader.Count();
+ for (size_t i = 0; i < count; ++i) {
+ const TString key = reader.KeyByIndex(i);
+ const TString fileName = CutFirstSlash(key);
+ if (!Quiet) {
+ Cerr << archive << " --> " << fileName << Endl;
+ }
+ const TFsPath path(dir / fileName);
+ path.Parent().MkDirs();
+ TAutoPtr<IInputStream> in = reader.ObjectByKey(key);
+ TFixedBufferFileOutput out(path);
+ TransferData(in.Get(), &out);
+ out.Finish();
+ }
+}
+
+static void ListArchive(const TString& archive, bool cutSlash) {
+ TMappingReader mappingReader(archive);
+ const TArchiveReader& reader = mappingReader.Reader;
+ const size_t count = reader.Count();
+ for (size_t i = 0; i < count; ++i) {
+ const TString key = reader.KeyByIndex(i);
+ TString fileName = key;
+ if (cutSlash) {
+ fileName = CutFirstSlash(key);
+ }
+ Cout << fileName << Endl;
+ }
+}
+
+static void ListArchiveMd5(const TString& archive, bool cutSlash) {
+ TMappingReader mappingReader(archive);
+ const TArchiveReader& reader = mappingReader.Reader;
+ const size_t count = reader.Count();
+ for (size_t i = 0; i < count; ++i) {
+ const TString key = reader.KeyByIndex(i);
+ TString fileName = key;
+ if (cutSlash) {
+ fileName = CutFirstSlash(key);
+ }
+ char md5buf[33];
+ Cout << fileName << '\t' << MD5::Stream(reader.ObjectByKey(key).Get(), md5buf) << Endl;
+ }
+}
+
+int main(int argc, char** argv) {
+ NLastGetopt::TOpts opts;
+ opts.AddHelpOption('?');
+ opts.SetTitle(
+ "Archiver\n"
+ "Docs: https://wiki.yandex-team.ru/Development/Poisk/arcadia/tools/archiver"
+ );
+
+ bool hexdump = false;
+ opts.AddLongOption('x', "hexdump", "Produce hexdump")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&hexdump, true);
+
+ size_t stride = 0;
+ opts.AddLongOption('s', "segments", "Produce segmented C strings array of given size")
+ .RequiredArgument("<size>")
+ .Optional()
+ .DefaultValue("0")
+ .StoreResult(&stride);
+
+ bool cat = false;
+ opts.AddLongOption('c', "cat", "Do not store keys (file names), just cat uncompressed files")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&cat, true);
+
+ bool doNotZip = false;
+ opts.AddLongOption('p', "plain", "Do not use compression")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&doNotZip, true);
+
+ bool deduplicate = false;
+ opts.AddLongOption("deduplicate", "Turn on file-wise deduplication")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&deduplicate, true);
+
+ bool unpack = false;
+ opts.AddLongOption('u', "unpack", "Unpack archive into current directory")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&unpack, true);
+
+ bool list = false;
+ opts.AddLongOption('l', "list", "List files in archive")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&list, true);
+
+ bool cutSlash = true;
+ opts.AddLongOption("as-is", "somewhy slash is cutted by default in list; with this option key will be shown as-is")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&cutSlash, false);
+
+ bool listMd5 = false;
+ opts.AddLongOption('m', "md5", "List files in archive with MD5 sums")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&listMd5, true);
+
+ bool recursive = false;
+ opts.AddLongOption('r', "recursive", "Read all files under each directory, recursively")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&recursive, true);
+
+ Quiet = false;
+ opts.AddLongOption('q', "quiet", "Do not output progress to stderr")
+ .NoArgument()
+ .Optional()
+ .StoreValue(&Quiet, true);
+
+ TString prepend;
+ opts.AddLongOption('z', "prepend", "Prepend string to output")
+ .RequiredArgument("<prefix>")
+ .StoreResult(&prepend);
+
+ TString append;
+ opts.AddLongOption('a', "append", "Append string to output")
+ .RequiredArgument("<suffix>")
+ .StoreResult(&append);
+
+ TString outputf;
+ opts.AddLongOption('o', "output", "Output to file instead stdout")
+ .RequiredArgument("<file>")
+ .StoreResult(&outputf);
+
+ TString unpackDir;
+ opts.AddLongOption('d', "unpackdir", "Unpack destination directory")
+ .RequiredArgument("<dir>")
+ .DefaultValue(".")
+ .StoreResult(&unpackDir);
+
+ TString yasmBase;
+ opts.AddLongOption('A', "yasm", "Output dump is yasm format")
+ .RequiredArgument("<base>")
+ .StoreResult(&yasmBase);
+
+ TString cppBase;
+ opts.AddLongOption('C', "cpp", "Output dump is C/C++ format")
+ .RequiredArgument("<base>")
+ .StoreResult(&cppBase);
+
+ TString forceKeys;
+ opts.AddLongOption('k', "keys", "Set explicit list of keys for elements")
+ .RequiredArgument("<keys>")
+ .StoreResult(&forceKeys);
+
+ opts.SetFreeArgDefaultTitle("<file>");
+ opts.SetFreeArgsMin(1);
+ NLastGetopt::TOptsParseResult optsRes(&opts, argc, argv);
+
+ SubstGlobal(append, "\\n", "\n");
+ SubstGlobal(prepend, "\\n", "\n");
+
+ TVector<TRec> recs;
+ const auto& files = optsRes.GetFreeArgs();
+
+ TVector<TStringBuf> keys;
+ if (forceKeys.size())
+ StringSplitter(forceKeys).Split(':').SkipEmpty().Collect(&keys);
+
+ if (keys.size() && keys.size() != files.size()) {
+ Cerr << "Invalid number of keys=" << keys.size() << " (!= number of files=" << files.size() << ")" << Endl;
+ return 1;
+ }
+
+ for (size_t i = 0; i < files.size(); ++i) {
+ const auto& path = files[i];
+ size_t off = 0;
+#ifdef _win_
+ if (path[0] > 0 && isalpha(path[0]) && path[1] == ':')
+ off = 2; // skip drive letter ("d:")
+#endif // _win_
+ const size_t pos = path.find(':', off);
+ TRec cur;
+ cur.Path = path.substr(0, pos);
+ if (pos != TString::npos)
+ cur.Prefix = path.substr(pos + 1);
+ if (keys.size())
+ cur.Key = keys[i];
+ cur.Recursive = recursive;
+ cur.Fix();
+ recs.push_back(cur);
+ }
+
+ try {
+ if (listMd5) {
+ for (const auto& rec: recs) {
+ ListArchiveMd5(rec.Path, cutSlash);
+ }
+ } else if (list) {
+ for (const auto& rec: recs) {
+ ListArchive(rec.Path, cutSlash);
+ }
+ } else if (unpack) {
+ const TFsPath dir(unpackDir);
+ for (const auto& rec: recs) {
+ UnpackArchive(rec.Path, dir);
+ }
+ } else {
+ TAutoPtr<IOutputStream> outf(OpenOutput(outputf));
+ IOutputStream* out = outf.Get();
+ THolder<IOutputStream> hexout;
+
+ if (hexdump) {
+ hexout.Reset(new THexOutput(out));
+ out = hexout.Get();
+ } else if (stride) {
+ hexout.Reset(new TStringArrayOutput(out, stride));
+ out = hexout.Get();
+ } else if (yasmBase) {
+ hexout.Reset(new TYasmOutput(out, yasmBase));
+ out = hexout.Get();
+ } else if (cppBase) {
+ hexout.Reset(new TCStringOutput(out, cppBase));
+ out = hexout.Get();
+ }
+
+ outf->Write(prepend.data(), prepend.size());
+
+ if (cat) {
+ for (const auto& rec: recs) {
+ rec.Recurse(*out);
+ }
+ } else {
+ TDuplicatesMap duplicatesMap;
+ if (deduplicate) {
+ for (const auto& rec: recs) {
+ rec.Recurse(duplicatesMap);
+ }
+ }
+ duplicatesMap.Finish();
+ TDeduplicationArchiveWriter w(duplicatesMap, out, !doNotZip);
+ for (const auto& rec: recs) {
+ rec.Recurse(w);
+ }
+ w.Finish();
+ }
+
+ try {
+ out->Finish();
+ } catch (...) {
+ }
+
+ outf->Write(append.data(), append.size());
+ }
+ } catch (...) {
+ Cerr << CurrentExceptionMessage() << Endl;
+ return 1;
+ }
+
+ return 0;
+}