// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/trie.h" #include #include #include "contrib/libs/apache/arrow_next/cpp/src/arrow/util/logging.h" namespace arrow20 { namespace internal { Status Trie::Validate() const { const auto n_nodes = static_cast(nodes_.size()); if (size_ > n_nodes) { return Status::Invalid("Number of entries larger than number of nodes"); } for (const auto& node : nodes_) { if (node.found_index_ >= size_) { return Status::Invalid("Found index >= size"); } if (node.child_lookup_ != -1 && node.child_lookup_ * 256 > static_cast(lookup_table_.size() - 256)) { return Status::Invalid("Child lookup base doesn't point to 256 valid indices"); } } for (const auto index : lookup_table_) { if (index >= n_nodes) { return Status::Invalid("Child lookup index out of bounds"); } } return Status::OK(); } void Trie::Dump(const Node* node, const std::string& indent) const { std::cerr << "[\"" << node->substring_ << "\"]"; if (node->found_index_ >= 0) { std::cerr << " *"; } std::cerr << "\n"; if (node->child_lookup_ >= 0) { auto child_indent = indent + " "; std::cerr << child_indent << "|\n"; for (fast_index_type i = 0; i < 256; ++i) { auto child_index = lookup_table_[node->child_lookup_ * 256 + i]; if (child_index >= 0) { const Node* child = &nodes_[child_index]; std::cerr << child_indent << "|-> '" << static_cast(i) << "' (" << i << ") -> "; Dump(child, child_indent); } } } } void Trie::Dump() const { Dump(&nodes_[0], ""); } TrieBuilder::TrieBuilder() { trie_.nodes_.push_back(Trie::Node{-1, -1, ""}); } Status TrieBuilder::AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node) { if (parent->child_lookup_ == -1) { RETURN_NOT_OK(ExtendLookupTable(&parent->child_lookup_)); } auto parent_lookup = parent->child_lookup_ * 256 + ch; DCHECK_EQ(trie_.lookup_table_[parent_lookup], -1); if (trie_.nodes_.size() >= static_cast(kMaxIndex)) { auto max_capacity = kMaxIndex; return Status::CapacityError("TrieBuilder cannot contain more than ", max_capacity, " child nodes"); } trie_.nodes_.push_back(std::move(node)); trie_.lookup_table_[parent_lookup] = static_cast(trie_.nodes_.size() - 1); return Status::OK(); } Status TrieBuilder::CreateChildNode(Trie::Node* parent, uint8_t ch, std::string_view substring) { const auto kMaxSubstringLength = Trie::kMaxSubstringLength; while (substring.length() > kMaxSubstringLength) { // Substring doesn't fit in node => create intermediate node auto mid_node = Trie::Node{-1, -1, substring.substr(0, kMaxSubstringLength)}; RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(mid_node))); // Recurse parent = &trie_.nodes_.back(); ch = static_cast(substring[kMaxSubstringLength]); substring = substring.substr(kMaxSubstringLength + 1); } // Create final matching node auto child_node = Trie::Node{trie_.size_, -1, substring}; RETURN_NOT_OK(AppendChildNode(parent, ch, std::move(child_node))); ++trie_.size_; return Status::OK(); } Status TrieBuilder::CreateChildNode(Trie::Node* parent, char ch, std::string_view substring) { return CreateChildNode(parent, static_cast(ch), substring); } Status TrieBuilder::ExtendLookupTable(index_type* out_index) { auto cur_size = trie_.lookup_table_.size(); auto cur_index = cur_size / 256; if (cur_index > static_cast(kMaxIndex)) { return Status::CapacityError("TrieBuilder cannot extend lookup table further"); } trie_.lookup_table_.resize(cur_size + 256, -1); *out_index = static_cast(cur_index); return Status::OK(); } Status TrieBuilder::SplitNode(fast_index_type node_index, fast_index_type split_at) { Trie::Node* node = &trie_.nodes_[node_index]; DCHECK_LT(split_at, node->substring_length()); // Before: // {node} -> [...] // After: // {node} -> [c] -> {out_node} -> [...] auto child_node = Trie::Node{node->found_index_, node->child_lookup_, node->substring_.substr(split_at + 1)}; auto ch = node->substring_[split_at]; node->child_lookup_ = -1; node->found_index_ = -1; node->substring_ = node->substring_.substr(0, split_at); RETURN_NOT_OK(AppendChildNode(node, ch, std::move(child_node))); return Status::OK(); } Status TrieBuilder::Append(std::string_view s, bool allow_duplicate) { // Find or create node for string fast_index_type node_index = 0; fast_index_type pos = 0; fast_index_type remaining = static_cast(s.length()); while (true) { Trie::Node* node = &trie_.nodes_[node_index]; const auto substring_length = node->substring_length(); const auto substring_data = node->substring_data(); for (fast_index_type i = 0; i < substring_length; ++i) { if (remaining == 0) { // New string too short => need to split node RETURN_NOT_OK(SplitNode(node_index, i)); // Current node matches exactly node = &trie_.nodes_[node_index]; node->found_index_ = trie_.size_++; return Status::OK(); } if (s[pos] != substring_data[i]) { // Mismatching substring => need to split node RETURN_NOT_OK(SplitNode(node_index, i)); // Create new node for mismatching char node = &trie_.nodes_[node_index]; return CreateChildNode(node, s[pos], s.substr(pos + 1)); } ++pos; --remaining; } if (remaining == 0) { // Node matches exactly if (node->found_index_ >= 0) { if (allow_duplicate) { return Status::OK(); } else { return Status::Invalid("Duplicate entry in trie"); } } node->found_index_ = trie_.size_++; return Status::OK(); } // Lookup child using next input character if (node->child_lookup_ == -1) { // Need to extend lookup table for this node RETURN_NOT_OK(ExtendLookupTable(&node->child_lookup_)); } auto c = static_cast(s[pos++]); --remaining; node_index = trie_.lookup_table_[node->child_lookup_ * 256 + c]; if (node_index == -1) { // Child not found => need to create child node return CreateChildNode(node, c, s.substr(pos)); } node = &trie_.nodes_[node_index]; } } Trie TrieBuilder::Finish() { return std::move(trie_); } } // namespace internal } // namespace arrow20