// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <limits>
#include <memory>
#include <new>
#include <set>
#include <thread>  // NOLINT(build/c++11)
#include <utility>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/container/flat_hash_map.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "benchmark/benchmark.h"
#include "tcmalloc/internal/declarations.h"
#include "tcmalloc/internal/linked_list.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/testing/testutil.h"

namespace tcmalloc {
namespace {

TEST(AllocationSampleTest, TokenAbuse) {
  auto token = MallocExtension::StartAllocationProfiling();
  void *ptr = ::operator new(512 * 1024 * 1024);
  // TODO(b/183453911): Remove workaround for GCC 10.x deleting operator new,
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94295.
  benchmark::DoNotOptimize(ptr);
  ::operator delete(ptr);

  // Repeated claims (calls to Stop) should happily return an empty profile.
  auto profile = std::move(token).Stop();
  int count = 0;
  profile.Iterate([&](const Profile::Sample &) { count++; });
#if !defined(UNDEFINED_BEHAVIOR_SANITIZER)
  // UBSan does not implement our profiling API, but running the test can
  // validate the correctness of the new/delete pairs.
  EXPECT_EQ(count, 1);
#endif

  auto profile2 = std::move(token).Stop();  // NOLINT: use-after-move intended
  int count2 = 0;
  profile2.Iterate([&](const Profile::Sample &) { count2++; });
  EXPECT_EQ(count2, 0);

  // Destroying the token (when the scope ends) without claiming its profile
  // should also be OK.
  { MallocExtension::StartAllocationProfiling(); }
}

// Verify that profiling sessions concurrent with allocations do not crash due
// to mutating pointers accessed by the sampling code (b/143623146).
TEST(AllocationSampleTest, RaceToClaim) {
  MallocExtension::SetProfileSamplingRate(1 << 14);

  absl::BlockingCounter counter(2);
  std::atomic<bool> stop{false};

  std::thread t1([&]() {
    counter.DecrementCount();
    while (!stop) {
      auto token = MallocExtension::StartAllocationProfiling();
      absl::SleepFor(absl::Microseconds(1));
      auto profile = std::move(token).Stop();
    }
  });

  std::thread t2([&]() {
    counter.DecrementCount();
    const int kNum = 1000000;
    std::vector<void *> ptrs;
    while (!stop) {
      for (int i = 0; i < kNum; i++) {
        ptrs.push_back(::operator new(1));
      }
      for (void *p : ptrs) {
        sized_delete(p, 1);
      }
      ptrs.clear();
    }
  });

  // Verify the threads are up and running before we start the clock.
  counter.Wait();

  absl::SleepFor(absl::Seconds(1));
  stop.store(true);

  t1.join();
  t2.join();
}

TEST(AllocationSampleTest, SampleAccuracy) {
  // Disable GWP-ASan, since it allocates different sizes than normal samples.
  MallocExtension::SetGuardedSamplingRate(-1);

  // Allocate about 512 MiB each of various sizes. For _some_ (but not all)
  // sizes, delete the objects as we go; it shouldn't matter for the sample
  // count.
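  //
  // A note on why the per-size byte totals checked below should each land
  // near kTotalPerSize (an explanatory sketch added here; the exact scaling
  // is an assumption about the sampler, not something this test asserts):
  // an allocation of size s is sampled with probability roughly
  // s / sampling_period, and each sample is scaled back up by the inverse of
  // that probability when the profile is built. Summing the Sample::sum
  // fields for one size therefore estimates the total bytes allocated at
  // that size, regardless of the object size.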
  static const size_t kTotalPerSize = 512 * 1024 * 1024;

  // (object size, object alignment, keep objects)
  struct Requests {
    size_t size;
    size_t alignment;
    bool keep;  // objects we don't delete as we go
    void *list = nullptr;
  };
  std::vector<Requests> sizes = {{8, 0, false},
                                 {16, 16, true},
                                 {1024, 0, false},
                                 {64 * 1024, 64, false},
                                 {512 * 1024, 0, true},
                                 {1024 * 1024, 128, true}};
  std::set<size_t> sizes_expected;
  for (auto s : sizes) {
    sizes_expected.insert(s.size);
  }

  auto token = MallocExtension::StartAllocationProfiling();

  // We use new/delete to allocate memory, as malloc returns objects aligned
  // to std::max_align_t.
  for (auto &s : sizes) {
    for (size_t bytes = 0; bytes < kTotalPerSize; bytes += s.size) {
      void *obj;
      if (s.alignment > 0) {
        obj = operator new(s.size, static_cast<std::align_val_t>(s.alignment));
      } else {
        obj = operator new(s.size);
      }
      if (s.keep) {
        tcmalloc_internal::SLL_Push(&s.list, obj);
      } else if (s.alignment > 0) {
        operator delete(obj, static_cast<std::align_val_t>(s.alignment));
      } else {
        operator delete(obj);
      }
    }
  }

  auto profile = std::move(token).Stop();

  // size -> bytes seen
  absl::flat_hash_map<size_t, size_t> m;
  // size -> alignment request
  absl::flat_hash_map<size_t, size_t> alignment;
  for (auto s : sizes) {
    alignment[s.size] = s.alignment;
  }

  profile.Iterate([&](const tcmalloc::Profile::Sample &e) {
    // Skip unexpected sizes. They may have been triggered by a background
    // thread.
    if (sizes_expected.find(e.allocated_size) == sizes_expected.end()) {
      return;
    }

    // Don't check stack traces until we have evidence that's broken; it's
    // tedious and done fairly well elsewhere.
    m[e.allocated_size] += e.sum;
    EXPECT_EQ(alignment[e.requested_size], e.requested_alignment);
  });

#if !defined(UNDEFINED_BEHAVIOR_SANITIZER)
  // UBSan does not implement our profiling API, but running the test can
  // validate the correctness of the new/delete pairs.
  size_t max_bytes = 0, min_bytes = std::numeric_limits<size_t>::max();
  EXPECT_EQ(m.size(), sizes_expected.size());
  for (auto seen : m) {
    size_t bytes = seen.second;
    min_bytes = std::min(min_bytes, bytes);
    max_bytes = std::max(max_bytes, bytes);
  }
  // Hopefully we're in a fairly small range that contains our actual
  // allocation.
  // TODO(b/134690164): better statistical tests here.
  EXPECT_GE((min_bytes * 3) / 2, max_bytes);
  EXPECT_LE((min_bytes * 3) / 4, kTotalPerSize);
  EXPECT_LE(kTotalPerSize, (max_bytes * 4) / 3);
#endif

  // Remove the objects we left alive.
  for (auto &s : sizes) {
    while (s.list != nullptr) {
      void *obj = tcmalloc_internal::SLL_Pop(&s.list);
      if (s.alignment > 0) {
        operator delete(obj, static_cast<std::align_val_t>(s.alignment));
      } else {
        operator delete(obj);
      }
    }
  }
}

TEST(FragmentationzTest, Accuracy) {
  // Disable GWP-ASan, since it allocates different sizes than normal samples.
  MallocExtension::SetGuardedSamplingRate(-1);

  // A fairly odd allocation size, which will be rounded up to 128. This lets
  // us find our record in the table.
  static const size_t kItemSize = 115;
  // Allocate about 3.5 GiB:
  static const size_t kNumItems = 32 * 1024 * 1024;

  std::vector<std::unique_ptr<char[]>> keep;
  std::vector<std::unique_ptr<char[]>> drop;
  // Hint the expected sizes:
  drop.reserve(kNumItems * 8 / 10);
  keep.reserve(kNumItems * 2 / 10);

  // We allocate many items, then free 80% of them "randomly". (To decrease
  // noise and speed things up, we just keep every 5th one exactly.)
  for (int i = 0; i < kNumItems; ++i) {
    // Ideally we would use malloc() here, for consistency, but unique_ptr
    // doesn't come with a "free()" deleter; use ::operator new[] instead.
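    // (Note added here, as an assumption about the cleanup semantics rather
    // than original commentary: std::unique_ptr<char[]>'s default deleter
    // invokes delete[], which in practice pairs correctly with the
    // ::operator new[] call below for a trivially destructible char array.)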
    (i % 5 == 0 ? keep : drop)
        .push_back(std::unique_ptr<char[]>(
            static_cast<char *>(::operator new[](kItemSize))));
  }
  drop.resize(0);

  // There are at least 64 items per span here, so the probability that we
  // actually managed to free any given page is (8/10)^64 = 6.2e-7 ~= 0;
  // every page is fragmented. We still have 20% or so of the memory
  // allocated, so we should see 80% of it charged to these allocations as
  // fragmentation.
  auto profile = MallocExtension::SnapshotCurrent(ProfileType::kFragmentation);

  // Pull out the fragmentationz entry corresponding to this size.
  size_t requested_size = 0;
  size_t allocated_size = 0;
  size_t sum = 0;
  size_t count = 0;
  profile.Iterate([&](const Profile::Sample &e) {
    if (e.requested_size != kItemSize) return;

    if (requested_size == 0) {
      allocated_size = e.allocated_size;
      requested_size = e.requested_size;
    } else {
      // We will usually have a single entry in the profile, but in builds
      // without optimization our fast-path code causes the same call site to
      // have two different stack traces. Thus we expect, and deal with, a
      // second entry for the same allocation.
      EXPECT_EQ(requested_size, e.requested_size);
      EXPECT_EQ(allocated_size, e.allocated_size);
    }
    sum += e.sum;
    count += e.count;
  });

  double frag_bytes = sum;
  double real_frag_bytes =
      static_cast<double>(allocated_size * kNumItems) * 0.8;

  // We should be pretty close with this much data:
  // TODO(b/134690164): this is still slightly flaky (<1%) - why?
  EXPECT_NEAR(real_frag_bytes, frag_bytes, real_frag_bytes * 0.15)
      << " sum = " << sum << " allocated = " << allocated_size
      << " requested = " << requested_size << " count = " << count;
}

}  // namespace
}  // namespace tcmalloc