aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEgor Zudin <e-zudin@ydb.tech>2024-01-17 19:15:27 +0300
committerGitHub <noreply@github.com>2024-01-17 17:15:27 +0100
commit39325bd60d9c436b7d613e8c186b9e71c83a901f (patch)
tree7a47299a2e7958146770295f9778653f22adaa65
parent84e7ca09fe02af8bcbecf818501f8edc6fa0c97a (diff)
downloadydb-39325bd60d9c436b7d613e8c186b9e71c83a901f.tar.gz
YQ-2744: add tpc bucket option for ydb workload tpch (#1098)
* YQ-2744: add tpc bucket option for ydb workload tpch * Move bucket to cli parameters
-rw-r--r--ydb/public/lib/ydb_cli/commands/tpch.cpp62
-rw-r--r--ydb/public/lib/ydb_cli/commands/tpch.h3
-rw-r--r--ydb/public/lib/ydb_cli/commands/tpch_schema.sql98
-rw-r--r--ydb/public/lib/ydb_cli/commands/ya.make1
4 files changed, 105 insertions, 59 deletions
diff --git a/ydb/public/lib/ydb_cli/commands/tpch.cpp b/ydb/public/lib/ydb_cli/commands/tpch.cpp
index 026e572f60..0874e0b2ce 100644
--- a/ydb/public/lib/ydb_cli/commands/tpch.cpp
+++ b/ydb/public/lib/ydb_cli/commands/tpch.cpp
@@ -1,5 +1,6 @@
#include "tpch.h"
+#include <contrib/libs/fmt/include/fmt/format.h>
#include <util/string/split.h>
#include <util/stream/file.h>
#include <util/string/strip.h>
@@ -216,10 +217,19 @@ void TTpchCommandInit::Config(TConfig& config) {
TablesPath = arg;
});
config.Opts->AddLongOption("store", "Storage type."
- " Options: row, column\n"
+ " Options: row, column, s3\n"
"row - use row-based storage engine;\n"
- "column - use column-based storage engine.")
+ "column - use column-based storage engine.\n"
+ "s3 - use cloud tpc bucket")
.DefaultValue("row").StoreResult(&StoreType);
+ config.Opts->AddLongOption('s', "scale", "TPC-H dataset scale. One of 1, 10, 100, 1000. Default is 1")
+ .Optional()
+ .DefaultValue("1")
+ .StoreResult(&Scale);
+ config.Opts->AddLongOption('b', "bucket", "S3 bucket with TPC-H dataset")
+ .Optional()
+ .DefaultValue("")
+ .StoreResult(&Bucket);
};
void TTpchCommandInit::SetPartitionByCols(TString& createSql) {
@@ -248,10 +258,28 @@ int TTpchCommandInit::Run(TConfig& config) {
StoreType = to_lower(StoreType);
TString storageType = "";
TString notNull = "";
+ TString createExternalDataSource;
+ TString external;
+ TString partitioning = "AUTO_PARTITIONING_MIN_PARTITIONS_COUNT";
+ TString primaryKey = ", PRIMARY KEY";
if (StoreType == "column") {
- storageType = "STORE = COLUMN,";
+ storageType = "STORE = COLUMN, --";
notNull = "NOT NULL";
+ } else if (StoreType == "s3") {
+ storageType = R"(DATA_SOURCE = "_tpc_s3_external_source", FORMAT = "parquet", LOCATION = )";
+ notNull = "NOT NULL";
+ createExternalDataSource = fmt::format(R"(
+ CREATE EXTERNAL DATA SOURCE `_tpc_s3_external_source` WITH (
+ SOURCE_TYPE="ObjectStorage",
+ LOCATION="https://storage.yandexcloud.net/{}/",
+ AUTH_METHOD="NONE"
+ );
+ )", Bucket);
+ external = "EXTERNAL";
+ partitioning = "--";
+ primaryKey = "--";
} else if (StoreType != "row") {
+ storageType = "-- ";
throw yexception() << "Incorrect storage type. Available options: \"row\", \"column\"." << Endl;
}
@@ -260,11 +288,18 @@ int TTpchCommandInit::Run(TConfig& config) {
TString createSql = NResource::Find("tpch_schema.sql");
TTableClient client(driver);
+ SubstGlobal(createSql, "{createExternal}", createExternalDataSource);
+ SubstGlobal(createSql, "{external}", external);
SubstGlobal(createSql, "{notnull}", notNull);
+ SubstGlobal(createSql, "{partitioning}", partitioning);
+ SubstGlobal(createSql, "{primary_key}", primaryKey);
SubstGlobal(createSql, "{path}", TablesPath);
+ SubstGlobal(createSql, "{scale}", Scale);
SubstGlobal(createSql, "{store}", storageType);
SetPartitionByCols(createSql);
+ Cout << createSql << Endl;
+
ThrowOnError(client.RetryOperationSync([createSql](TSession session) {
return session.ExecuteSchemeQuery(createSql).GetValueSync();
}));
@@ -281,25 +316,30 @@ TTpchCommandClean::TTpchCommandClean()
void TTpchCommandClean::Config(TConfig& config) {
NYdb::NConsoleClient::TClientCommand::Config(config);
config.SetFreeArgsNum(0);
+ config.Opts->AddLongOption('e', "external", "Drop tables as external. Use if initialized with external storage")
+ .Optional()
+ .StoreTrue(&IsExternal);
};
int TTpchCommandClean::Run(TConfig& config) {
auto driver = CreateDriver(config);
TTableClient client(driver);
- static const char DropDdlTmpl[] = "DROP TABLE `%s`;";
- char dropDdl[sizeof(DropDdlTmpl) + 8192*3]; // 32*256 for DbPath
+ TString dropDdl;
for (auto& table : Tables) {
TString fullPath = FullTablePath(config.Database, table);
- int res = std::snprintf(dropDdl, sizeof(dropDdl), DropDdlTmpl, fullPath.c_str());
- if (res < 0) {
- Cerr << "Failed to generate DROP DDL query for `" << fullPath << "` table." << Endl;
- return -1;
- }
+ fmt::format_to(std::back_inserter(dropDdl), "DROP {} TABLE `{}`", IsExternal ? "EXTERNAL" : "", fullPath);
- ThrowOnError(client.RetryOperationSync([dropDdl](TSession session) {
+ ThrowOnError(client.RetryOperationSync([&dropDdl](TSession session) {
return session.ExecuteSchemeQuery(dropDdl).GetValueSync();
}));
+ dropDdl.clear();
+ }
+
+ if (IsExternal) {
+ ThrowOnError(client.RetryOperationSync([](TSession session) {
+ return session.ExecuteSchemeQuery("DROP EXTERNAL DATA SOURCE `_tpc_s3_external_source`;").GetValueSync();
+ }));
}
Cout << "Clean succeeded." << Endl;
diff --git a/ydb/public/lib/ydb_cli/commands/tpch.h b/ydb/public/lib/ydb_cli/commands/tpch.h
index 28f7da178b..a12c63f573 100644
--- a/ydb/public/lib/ydb_cli/commands/tpch.h
+++ b/ydb/public/lib/ydb_cli/commands/tpch.h
@@ -17,6 +17,8 @@ private:
TString TablesPath;
TString StoreType;
+ TString Scale;
+ TString Bucket;
};
class TTpchCommandClean : public NYdb::NConsoleClient::TYdbCommand {
@@ -28,6 +30,7 @@ public:
private:
std::vector<TString> Tables = {"customer", "lineitem", "nation", "orders",
"region", "part", "partsupp", "supplier"};
+ bool IsExternal = false;
};
class TTpchCommandRun : public NYdb::NConsoleClient::TYdbCommand {
diff --git a/ydb/public/lib/ydb_cli/commands/tpch_schema.sql b/ydb/public/lib/ydb_cli/commands/tpch_schema.sql
index 5ef326da56..ed2b512081 100644
--- a/ydb/public/lib/ydb_cli/commands/tpch_schema.sql
+++ b/ydb/public/lib/ydb_cli/commands/tpch_schema.sql
@@ -1,4 +1,6 @@
-CREATE TABLE `{path}customer` (
+{createExternal}
+
+CREATE {external} TABLE `{path}customer` (
c_acctbal Double {notnull}, -- it should be Decimal(12, 2)
c_address String {notnull},
c_comment String {notnull},
@@ -6,15 +8,15 @@ CREATE TABLE `{path}customer` (
c_mktsegment String {notnull},
c_name String {notnull},
c_nationkey Int32 {notnull}, -- FK to N_NATIONKEY
- c_phone String {notnull},
- PRIMARY KEY (c_custkey)
+ c_phone String {notnull}
+ {primary_key} (c_custkey)
)
{partition_customer}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 64)
-;
+WITH ({store}"/h/s{scale}/parquet/customer/"
+{partitioning} = 64
+);
-CREATE TABLE `{path}lineitem` (
+CREATE {external} TABLE `{path}lineitem` (
l_comment String {notnull},
l_commitdate Date {notnull},
l_discount Double {notnull}, -- it should be Decimal(12, 2)
@@ -30,27 +32,27 @@ CREATE TABLE `{path}lineitem` (
l_shipinstruct String {notnull},
l_shipmode String {notnull},
l_suppkey Int32 {notnull}, -- FK to S_SUPPKEY, second part of the compound FK to (PS_PARTKEY, PS_SUPPKEY) with L_PARTKEY
- l_tax Double {notnull}, -- it should be Decimal(12, 2)
- PRIMARY KEY (l_orderkey, l_linenumber)
+ l_tax Double {notnull} -- it should be Decimal(12, 2)
+ {primary_key} (l_orderkey, l_linenumber)
)
{partition_lineitem}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 64)
-;
+WITH ({store}"/h/s{scale}/parquet/lineitem/"
+{partitioning} = 64
+);
-CREATE TABLE `{path}nation` (
+CREATE {external} TABLE `{path}nation` (
n_comment String {notnull},
n_name String {notnull},
n_nationkey Int32 {notnull}, -- Identifier
- n_regionkey Int32 {notnull}, -- FK to R_REGIONKEY
- PRIMARY KEY(n_nationkey)
+ n_regionkey Int32 {notnull} -- FK to R_REGIONKEY
+ {primary_key}(n_nationkey)
)
{partition_nation}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 1)
-;
+WITH ({store}"/h/s{scale}/parquet/nation/"
+{partitioning} = 1
+);
-CREATE TABLE `{path}orders` (
+CREATE {external} TABLE `{path}orders` (
o_clerk String {notnull},
o_comment String {notnull},
o_custkey Int32 {notnull}, -- FK to C_CUSTKEY
@@ -59,15 +61,15 @@ CREATE TABLE `{path}orders` (
o_orderpriority String {notnull},
o_orderstatus String {notnull},
o_shippriority Int32 {notnull},
- o_totalprice Double {notnull}, -- it should be Decimal(12, 2)
- PRIMARY KEY (o_orderkey)
+ o_totalprice Double {notnull} -- it should be Decimal(12, 2)
+ {primary_key} (o_orderkey)
)
{partition_orders}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 64)
-;
+WITH ({store}"/h/s{scale}/parquet/orders/"
+{partitioning} = 64
+);
-CREATE TABLE `{path}part` (
+CREATE {external} TABLE `{path}part` (
p_brand String {notnull},
p_comment String {notnull},
p_container String {notnull},
@@ -76,49 +78,49 @@ CREATE TABLE `{path}part` (
p_partkey Int32 {notnull}, -- Identifier
p_retailprice Double {notnull}, -- it should be Decimal(12, 2)
p_size Int32 {notnull},
- p_type String {notnull},
- PRIMARY KEY(p_partkey)
+ p_type String {notnull}
+ {primary_key}(p_partkey)
)
{partition_part}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 64)
-;
+WITH ({store}"/h/s{scale}/parquet/part/"
+{partitioning} = 64
+);
-CREATE TABLE `{path}partsupp` (
+CREATE {external} TABLE `{path}partsupp` (
ps_availqty Int32 {notnull},
ps_comment String {notnull},
ps_partkey Int32 {notnull}, -- FK to P_PARTKEY
ps_suppkey Int32 {notnull}, -- FK to S_SUPPKEY
- ps_supplycost Double {notnull}, -- it should be Decimal(12, 2)
- PRIMARY KEY(ps_partkey, ps_suppkey)
+ ps_supplycost Double {notnull} -- it should be Decimal(12, 2)
+ {primary_key}(ps_partkey, ps_suppkey)
)
{partition_partsupp}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 64)
-;
+WITH ({store}"/h/s{scale}/parquet/partsupp/"
+{partitioning} = 64
+);
-CREATE TABLE `{path}region` (
+CREATE {external} TABLE `{path}region` (
r_comment String {notnull},
r_name String {notnull},
- r_regionkey Int32 {notnull}, -- Identifier
- PRIMARY KEY(r_regionkey)
+ r_regionkey Int32 {notnull} -- Identifier
+ {primary_key}(r_regionkey)
)
{partition_region}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 1)
-;
+WITH ({store}"/h/s{scale}/parquet/region/"
+{partitioning} = 1
+);
-CREATE TABLE `{path}supplier` (
+CREATE {external} TABLE `{path}supplier` (
s_acctbal Double {notnull}, -- it should be Decimal(12, 2)
s_address String {notnull},
s_comment String {notnull},
s_name String {notnull},
s_nationkey Int32 {notnull}, -- FK to N_NATIONKEY
s_phone String {notnull},
- s_suppkey Int32 {notnull}, -- Identifier
- PRIMARY KEY(s_suppkey)
+ s_suppkey Int32 {notnull} -- Identifier
+ {primary_key}(s_suppkey)
)
{partition_supplier}
-WITH ({store}
-AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = 64)
-;
+WITH ({store}"/h/s{scale}/parquet/supplier/"
+{partitioning} = 64
+);
diff --git a/ydb/public/lib/ydb_cli/commands/ya.make b/ydb/public/lib/ydb_cli/commands/ya.make
index af38f94ce2..35472dbb12 100644
--- a/ydb/public/lib/ydb_cli/commands/ya.make
+++ b/ydb/public/lib/ydb_cli/commands/ya.make
@@ -35,6 +35,7 @@ SRCS(
)
PEERDIR(
+ contrib/libs/fmt
contrib/restricted/patched/replxx
library/cpp/histogram/hdr
library/cpp/protobuf/json