Moved other yql/essentials libs YQL-19206

init commit_hash:7d4c435602078407bbf20dd3c32f9c90d2bbcbc0
author: vvvv <vvvv@yandex-team.com> 2024-11-07 12:29:36 +0300
committer: vvvv <vvvv@yandex-team.com> 2024-11-07 13:49:47 +0300
commit: d4c258e9431675bab6745c8638df6e3dfd4dca6b (patch)
tree: b5efcfa11351152a4c872fccaea35749141c0b11 /yql/essentials/parser/pg_wrapper/postgresql/src/backend/nodes/queryjumblefuncs.c
parent: 13a4f274caef5cfdaf0263b24e4d6bdd5521472b (diff)
download: ydb-d4c258e9431675bab6745c8638df6e3dfd4dca6b.tar.gz
1 files changed, 397 insertions, 0 deletions
diff --git a/yql/essentials/parser/pg_wrapper/postgresql/src/backend/nodes/queryjumblefuncs.c b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/nodes/queryjumblefuncs.c
new file mode 100644
index 00000000000..be4cb66066b
--- /dev/null
+++ b/yql/essentials/parser/pg_wrapper/postgresql/src/backend/nodes/queryjumblefuncs.c
@@ -0,0 +1,397 @@
+/*-------------------------------------------------------------------------
+ *
+ * queryjumblefuncs.c
+ *	 Query normalization and fingerprinting.
+ *
+ * Normalization is a process whereby similar queries, typically differing only
+ * in their constants (though the exact rules are somewhat more subtle than
+ * that) are recognized as equivalent, and are tracked as a single entry.  This
+ * is particularly useful for non-prepared queries.
+ *
+ * Normalization is implemented by fingerprinting queries, selectively
+ * serializing those fields of each query tree's nodes that are judged to be
+ * essential to the query.  This is referred to as a query jumble.  This is
+ * distinct from a regular serialization in that various extraneous
+ * information is ignored as irrelevant or not essential to the query, such
+ * as the collations of Vars and, most notably, the values of constants.
+ *
+ * This jumble is acquired at the end of parse analysis of each query, and
+ * a 64-bit hash of it is stored into the query's Query.queryId field.
+ * The server then copies this value around, making it available in plan
+ * tree(s) generated from the query.  The executor can then use this value
+ * to blame query costs on the proper queryId.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/nodes/queryjumblefuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "common/hashfn.h"
+#include "miscadmin.h"
+#include "nodes/queryjumble.h"
+#include "parser/scansup.h"
+
+#define JUMBLE_SIZE				1024	/* size in bytes of the query serialization (jumble) buffer */
+
+/*
+ * GUC parameter.  Starts out as COMPUTE_QUERY_ID_AUTO; it is declared
+ * __thread, so every thread gets its own copy.
+ */
+__thread int			compute_query_id = COMPUTE_QUERY_ID_AUTO;
+
+/*
+ * True when compute_query_id is ON, or AUTO and a module requests them.
+ * EnableQueryId() sets this flag unless compute_query_id is OFF.
+ */
+__thread bool		query_id_enabled = false;
+
+/* Buffer-level helpers, defined later in this file. */
+static void AppendJumble(JumbleState *jstate, const unsigned char *item,
+						 Size size);
+static void RecordConstLocation(JumbleState *jstate, int location);
+
+/* Node-level jumbling routines, defined later in this file. */
+static void _jumbleNode(JumbleState *jstate, Node *node);
+static void _jumbleList(JumbleState *jstate, Node *node);
+static void _jumbleA_Const(JumbleState *jstate, Node *node);
+static void _jumbleRangeTblEntry(JumbleState *jstate, Node *node);
+
+/*
+ * CleanQuerytext
+ *		Narrow a possibly multi-statement source string down to the part of
+ *		interest and strip the whitespace surrounding it.
+ *
+ * On entry, *location is the byte offset of the statement within "query"
+ * (negative if unknown) and *len is its length in bytes (zero or negative
+ * meaning "up to the end of the string").  On exit, both are updated to
+ * describe the trimmed text, with *location still counted from the start of
+ * the string that was passed in.
+ *
+ * The result points into the caller's string at the first byte of the
+ * trimmed text; nothing is copied, so only *len bytes of it are meaningful.
+ */
+const char *
+CleanQuerytext(const char *query, int *location, int *len)
+{
+	int			offset = *location;
+	int			remaining = *len;
+
+	if (offset < 0)
+	{
+		/* Unknown start position: don't trust the length either. */
+		offset = 0;
+		remaining = strlen(query);
+	}
+	else
+	{
+		/* Skip ahead to the first byte of the statement. */
+		Assert(offset <= strlen(query));
+		query += offset;
+
+		/* A non-positive length stands for "rest of string". */
+		if (remaining > 0)
+		{
+			Assert(remaining <= strlen(query));
+		}
+		else
+		{
+			remaining = strlen(query);
+		}
+	}
+
+	/*
+	 * Trim whitespace at both ends.  scanner_isspace() is used instead of
+	 * libc's isspace() so that the result agrees with the lexer.
+	 */
+	while (remaining > 0 && scanner_isspace(query[0]))
+	{
+		query++;
+		offset++;
+		remaining--;
+	}
+	while (remaining > 0 && scanner_isspace(query[remaining - 1]))
+	{
+		remaining--;
+	}
+
+	*location = offset;
+	*len = remaining;
+
+	return query;
+}
+
+/*
+ * JumbleQuery
+ *		Jumble the given Query tree and store the resulting 64-bit hash in
+ *		query->queryId.
+ *
+ * Query ID computation must be enabled (this is asserted).  The stored ID is
+ * never zero.
+ *
+ * Returns the palloc'd JumbleState used for the computation, which holds the
+ * jumble buffer, the recorded constant locations and the highest external
+ * parameter ID seen.
+ */
+JumbleState *
+JumbleQuery(Query *query)
+{
+	JumbleState *jstate;
+	uint64		hash;
+
+	Assert(IsQueryIdEnabled());
+
+	/* Allocate the state and its work areas, all of them starting empty */
+	jstate = (JumbleState *) palloc(sizeof(JumbleState));
+	jstate->jumble_len = 0;
+	jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
+	jstate->clocations_count = 0;
+	jstate->clocations_buf_size = 32;
+	jstate->clocations = (LocationLen *)
+		palloc(jstate->clocations_buf_size * sizeof(LocationLen));
+	jstate->highest_extern_param_id = 0;
+
+	/* Walk the tree, then hash whatever ended up in the jumble buffer */
+	_jumbleNode(jstate, (Node *) query);
+	hash = DatumGetUInt64(hash_any_extended(jstate->jumble,
+											jstate->jumble_len,
+											0));
+
+	/*
+	 * A hash of zero is replaced: utility queries get 2, normal statements
+	 * get 1.
+	 */
+	if (hash == UINT64CONST(0))
+		hash = query->utilityStmt ? UINT64CONST(2) : UINT64CONST(1);
+
+	query->queryId = hash;
+
+	return jstate;
+}
+
+/*
+ * EnableQueryId
+ *		Request that query identifiers be computed.
+ *
+ * Meant for third-party plugins that need a query identifier from core.
+ * The request has no effect while compute_query_id is set to OFF.
+ */
+void
+EnableQueryId(void)
+{
+	/* An explicit OFF setting wins over any module's request. */
+	if (compute_query_id == COMPUTE_QUERY_ID_OFF)
+		return;
+
+	query_id_enabled = true;
+}
+
+/*
+ * AppendJumble
+ *		Add "size" bytes starting at "item" to the jumble kept in jstate.
+ *
+ * The jumble buffer holds JUMBLE_SIZE bytes.  Each time it fills up, its
+ * contents are hashed and the buffer is restarted with just that hash value
+ * in it, so the hash stands in for everything appended so far.  Input larger
+ * than the free space is therefore consumed in several pieces.
+ */
+static void
+AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
+{
+	unsigned char *buf = jstate->jumble;
+	Size		used = jstate->jumble_len;
+	const unsigned char *src = item;
+	Size		remaining = size;
+
+	while (remaining > 0)
+	{
+		Size		chunk;
+
+		/* Buffer full: collapse it down to a hash of its contents */
+		if (used >= JUMBLE_SIZE)
+		{
+			uint64		digest;
+
+			digest = DatumGetUInt64(hash_any_extended(buf, JUMBLE_SIZE, 0));
+			memcpy(buf, &digest, sizeof(digest));
+			used = sizeof(digest);
+		}
+
+		/* Copy as much as fits into the free space */
+		chunk = JUMBLE_SIZE - used;
+		if (remaining < chunk)
+			chunk = remaining;
+
+		memcpy(buf + used, src, chunk);
+		used += chunk;
+		src += chunk;
+		remaining -= chunk;
+	}
+
+	jstate->jumble_len = used;
+}
+
+/*
+ * RecordConstLocation
+ *		Remember where a constant sits in the query string of the tree being
+ *		walked.
+ *
+ * A negative location means "unknown or undefined" and is not recorded.
+ * The location array is doubled in size whenever it is full.
+ */
+static void
+RecordConstLocation(JumbleState *jstate, int location)
+{
+	LocationLen *slot;
+
+	if (location < 0)
+		return;
+
+	/* Make room for one more entry if the array is full */
+	if (jstate->clocations_count >= jstate->clocations_buf_size)
+	{
+		jstate->clocations_buf_size *= 2;
+		jstate->clocations = (LocationLen *)
+			repalloc(jstate->clocations,
+					 jstate->clocations_buf_size * sizeof(LocationLen));
+	}
+
+	slot = &jstate->clocations[jstate->clocations_count];
+	slot->location = location;
+	/* lengths start out as -1 to simplify third-party module usage */
+	slot->length = -1;
+	jstate->clocations_count++;
+}
+
+/*
+ * Shorthand for the jumbling routines.  All of these expect a JumbleState
+ * pointer named "jstate" in scope; all except JUMBLE_FIELD_SINGLE also expect
+ * the node being processed to be available as "expr".
+ */
+
+/* Recurse into the child node stored in expr->field */
+#define JUMBLE_NODE(field) \
+	_jumbleNode(jstate, (Node *) expr->field)
+
+/* Record expr->field as the location of a constant */
+#define JUMBLE_LOCATION(field) \
+	RecordConstLocation(jstate, expr->field)
+
+/* Append the raw bytes of expr->field to the jumble */
+#define JUMBLE_FIELD(field) \
+	AppendJumble(jstate, (const unsigned char *) &(expr->field), sizeof(expr->field))
+
+/* Append the raw bytes of an arbitrary lvalue to the jumble */
+#define JUMBLE_FIELD_SINGLE(value) \
+	AppendJumble(jstate, (const unsigned char *) &(value), sizeof(value))
+
+/* Append the string expr->field, terminator included; NULL adds nothing */
+#define JUMBLE_STRING(field) \
+do { \
+	if (expr->field) \
+		AppendJumble(jstate, (const unsigned char *) (expr->field), strlen(expr->field) + 1); \
+} while(0)
+
+#include "queryjumblefuncs.funcs.c"
+
+/*
+ * _jumbleNode
+ *		Jumble one node, recursing into its children as needed.
+ *
+ * A NULL node contributes nothing.  Otherwise the node's tag is appended
+ * first, and the tag-specific code then adds whatever else it considers
+ * significant.  Node types with no matching case only raise a WARNING.
+ */
+static void
+_jumbleNode(JumbleState *jstate, Node *node)
+{
+	/* the JUMBLE_* macros and the included switch body refer to "expr" */
+	Node	   *expr = node;
+
+	if (node == NULL)
+		return;
+
+	/* Overly complex expressions could otherwise overflow the stack */
+	check_stack_depth();
+
+	/* The NodeTag always goes in, ahead of any type-specific fields */
+	JUMBLE_FIELD(type);
+
+	switch (nodeTag(expr))
+	{
+#include "queryjumblefuncs.switch.c"
+
+		case T_List:
+		case T_IntList:
+		case T_OidList:
+		case T_XidList:
+			_jumbleList(jstate, expr);
+			break;
+
+		default:
+			/* Not fatal: we can stumble along without this node */
+			elog(WARNING, "unrecognized node type: %d",
+				 (int) nodeTag(expr));
+			break;
+	}
+
+	/*
+	 * Extra bookkeeping outside the automated code: track the highest
+	 * external Param id seen, in order to start normalization correctly.
+	 */
+	if (nodeTag(expr) == T_Param)
+	{
+		Param	   *param = (Param *) node;
+
+		if (param->paramkind == PARAM_EXTERN &&
+			param->paramid > jstate->highest_extern_param_id)
+		{
+			jstate->highest_extern_param_id = param->paramid;
+		}
+	}
+}
+
+/*
+ * _jumbleList
+ *		Jumble the elements of a List node.
+ *
+ * Members of a pointer list are jumbled as nodes; members of integer, OID
+ * and XID lists are appended by value.  Any other list type raises an ERROR.
+ */
+static void
+_jumbleList(JumbleState *jstate, Node *node)
+{
+	List	   *list = (List *) node;
+	ListCell   *cell;
+
+	switch (list->type)
+	{
+		case T_List:
+			foreach(cell, list)
+			{
+				_jumbleNode(jstate, lfirst(cell));
+			}
+			break;
+		case T_IntList:
+			foreach(cell, list)
+			{
+				JUMBLE_FIELD_SINGLE(lfirst_int(cell));
+			}
+			break;
+		case T_OidList:
+			foreach(cell, list)
+			{
+				JUMBLE_FIELD_SINGLE(lfirst_oid(cell));
+			}
+			break;
+		case T_XidList:
+			foreach(cell, list)
+			{
+				JUMBLE_FIELD_SINGLE(lfirst_xid(cell));
+			}
+			break;
+		default:
+			elog(ERROR, "unrecognized list node type: %d",
+				 (int) list->type);
+			break;
+	}
+}
+
+/*
+ * _jumbleA_Const
+ *		Jumble an A_Const node.
+ *
+ * The isnull flag always goes in.  For a non-NULL constant, the tag of the
+ * contained value node follows, and then the value itself: integers and
+ * booleans as raw bytes, the other kinds as strings.
+ */
+static void
+_jumbleA_Const(JumbleState *jstate, Node *node)
+{
+	A_Const    *expr = (A_Const *) node;
+
+	JUMBLE_FIELD(isnull);
+
+	/* Nothing more to add for a NULL constant */
+	if (expr->isnull)
+		return;
+
+	JUMBLE_FIELD(val.node.type);
+
+	switch (nodeTag(&expr->val))
+	{
+		case T_Integer:
+			JUMBLE_FIELD(val.ival.ival);
+			break;
+		case T_Boolean:
+			JUMBLE_FIELD(val.boolval.boolval);
+			break;
+		case T_Float:
+			JUMBLE_STRING(val.fval.fval);
+			break;
+		case T_String:
+			JUMBLE_STRING(val.sval.sval);
+			break;
+		case T_BitString:
+			JUMBLE_STRING(val.bsval.bsval);
+			break;
+		default:
+			elog(ERROR, "unrecognized node type: %d",
+				 (int) nodeTag(&expr->val));
+			break;
+	}
+}
+
+/*
+ * _jumbleRangeTblEntry
+ *		Jumble a RangeTblEntry node.
+ *
+ * The RTE kind always goes in; which further fields are added depends on
+ * that kind.  An unknown kind raises an ERROR.
+ */
+static void
+_jumbleRangeTblEntry(JumbleState *jstate, Node *node)
+{
+	RangeTblEntry *expr = (RangeTblEntry *) node;
+
+	JUMBLE_FIELD(rtekind);
+
+	switch (expr->rtekind)
+	{
+			/* Nothing beyond the kind itself */
+		case RTE_RESULT:
+			break;
+
+			/* Kinds described by a single child node */
+		case RTE_SUBQUERY:
+			JUMBLE_NODE(subquery);
+			break;
+		case RTE_FUNCTION:
+			JUMBLE_NODE(functions);
+			break;
+		case RTE_TABLEFUNC:
+			JUMBLE_NODE(tablefunc);
+			break;
+		case RTE_VALUES:
+			JUMBLE_NODE(values_lists);
+			break;
+
+			/* Kinds described by scalar or string fields */
+		case RTE_RELATION:
+			JUMBLE_FIELD(relid);
+			JUMBLE_NODE(tablesample);
+			JUMBLE_FIELD(inh);
+			break;
+		case RTE_JOIN:
+			JUMBLE_FIELD(jointype);
+			break;
+		case RTE_NAMEDTUPLESTORE:
+			JUMBLE_STRING(enrname);
+			break;
+		case RTE_CTE:
+
+			/*
+			 * Relying on the CTE's name is not ideal, but nothing else is
+			 * available here to identify the WITH item being referenced.
+			 */
+			JUMBLE_STRING(ctename);
+			JUMBLE_FIELD(ctelevelsup);
+			break;
+
+		default:
+			elog(ERROR, "unrecognized RTE kind: %d", (int) expr->rtekind);
+			break;
+	}
+}
author	vvvv <vvvv@yandex-team.com>	2024-11-07 12:29:36 +0300
committer	vvvv <vvvv@yandex-team.com>	2024-11-07 13:49:47 +0300
commit	d4c258e9431675bab6745c8638df6e3dfd4dca6b (patch)
tree	b5efcfa11351152a4c872fccaea35749141c0b11 /yql/essentials/parser/pg_wrapper/postgresql/src/backend/nodes/queryjumblefuncs.c
parent	13a4f274caef5cfdaf0263b24e4d6bdd5521472b (diff)
download	ydb-d4c258e9431675bab6745c8638df6e3dfd4dca6b.tar.gz