3 files changed, 222 insertions, 29 deletions
diff --git a/src/atrac/at3/atrac3_bitstream.cpp b/src/atrac/at3/atrac3_bitstream.cpp
index 56ef5a5..9b1a61d 100644
--- a/src/atrac/at3/atrac3_bitstream.cpp
+++ b/src/atrac/at3/atrac3_bitstream.cpp
@@ -19,6 +19,7 @@
 #include "atrac3_bitstream.h"
 #include "qmf/qmf.h"
 #include <atrac/atrac_psy_common.h>
+#include <atrac/atrac_enc_cache.h>
 #include <bitstream/bitstream.h>
 #include <util.h>
 #include <env.h>
@@ -147,40 +148,82 @@ uint32_t VLCEnc(const uint32_t selector, const int mantissas[TAtrac3Data::MaxSpe
     return bitsUsed;
 }
 
+// Cached per-BFU quantization result reused across the bit-allocation binary
+// search. For a fixed (bfu, wordlen) and a fixed input spectrum the quantized
+// mantissas and their CLC/VLC costs are deterministic: compute them once.
+class TAt3SpecUnit : public TUnit {
+public:
+    // TEncCache::TProvideUnit factory: quantizes the block-local `values` of
+    // `bfu` at `wordlen`. Returns a `new`-allocated unit owned by the caller.
+    static TUnit* Provide(size_t /*ch*/, size_t bfu, size_t wordlen, const float* values, void*) {
+        // Hold the unit in a smart pointer while filling it: resize() may throw.
+        std::unique_ptr<TAt3SpecUnit> u(new TAt3SpecUnit());
+        const uint32_t blockSize = TAtrac3Data::BlockSizeTab[bfu + 1] - TAtrac3Data::BlockSizeTab[bfu];
+        const float mul = TAtrac3Data::MaxQuant[std::min((uint32_t)wordlen, (uint32_t)7)];
+
+        u->Wordlen = wordlen;
+        u->Multiplier = mul;
+        u->Mantisas.resize(blockSize);
+        // The flag passed to QuantMantisas() (bfu > LOSY_NAQ_START) depends only
+        // on bfu, so it is constant for a given cache key.
+        u->EnergyErr = QuantMantisas(values, 0, blockSize, mul, bfu > LOSY_NAQ_START, u->Mantisas.data());
+        u->ClcBits = CLCEnc(wordlen, u->Mantisas.data(), blockSize, nullptr);
+        u->VlcBits = VLCEnc(wordlen, u->Mantisas.data(), blockSize, nullptr);
+        return u.release();
+    }
+
+    float EnergyErr = 0.0f; // value returned by QuantMantisas() for this unit
+    uint32_t ClcBits = 0; // CLC spectrum cost (no per-block header bits)
+    uint32_t VlcBits = 0; // VLC spectrum cost (no per-block header bits)
+};
+
+// atrac3 has only MS stereo and BFUs carry no channel identity, so the cache
+// (reset per channel) is keyed purely on <bfu, wordlen>; `ch` is unused.
+static size_t MakeAt3SpecKey(size_t /*ch*/, size_t bfu, size_t wordlen) {
+    ASSERT(bfu < 32); // bfu occupies the 5 bits above wordlen
+    ASSERT(wordlen < 8); // wordlen occupies the low 3 bits
+    return (bfu << 3) | wordlen;
+}
+// Exclusive upper bound on MakeAt3SpecKey(): 32 bfus * 8 wordlens = 256 keys.
+static constexpr size_t kAt3SpecCacheKeys = 1u << 8;
+
 std::pair<uint8_t, uint32_t> CalcSpecsBitsConsumption(const TAtrac3BitStreamWriter::TSingleChannelElement& sce,
                                                       const vector<uint32_t>& precisionPerEachBlocks,
                                                       int* mantisas,
-                                                      vector<float>& energyErr)
+                                                      vector<float>& energyErr,
+                                                      TEncCache& cache)
 {
     const vector<TScaledBlock>& scaledBlocks = sce.ScaledBlocks;
     const uint32_t numBlocks = precisionPerEachBlocks.size();
     uint32_t bitsUsed = numBlocks * 3;
 
-    auto lambda = [numBlocks, mantisas, &precisionPerEachBlocks, &scaledBlocks, &energyErr](bool clcMode, bool calcMant) {
-        uint32_t bits = 0;
-        for (uint32_t i = 0; i < numBlocks; ++i) {
-            if (precisionPerEachBlocks[i] == 0) {
-                continue;
-            }
-            bits += 6; // sfi
-            const uint32_t first = TAtrac3Data::BlockSizeTab[i];
-            const uint32_t last = TAtrac3Data::BlockSizeTab[i + 1];
-            const uint32_t blockSize = last - first;
-            const float mul = TAtrac3Data::MaxQuant[std::min(precisionPerEachBlocks[i], (uint32_t)7)];
-            if (calcMant) {
-                const float* values = scaledBlocks[i].Values.data();
-                energyErr[i] = QuantMantisas(values, first, last, mul, i > LOSY_NAQ_START, mantisas);
-            }
-            bits += clcMode ? CLCEnc(precisionPerEachBlocks[i], mantisas + first, blockSize, nullptr)
-                            : VLCEnc(precisionPerEachBlocks[i], mantisas + first, blockSize, nullptr);
+    // bitsUsed starts at 3 bits per block and gains 6 (sfi) per coded block.
+    // Those bits are the same in both coding modes, so only the CLC and VLC
+    // spectrum costs of the cached units are summed and compared at the end.
+    uint32_t clcSpecBits = 0;
+    uint32_t vlcSpecBits = 0;
+    for (uint32_t i = 0; i < numBlocks; ++i) {
+        if (precisionPerEachBlocks[i] == 0) {
+            continue;
         }
-        return bits;
-    };
+        bitsUsed += 6; // sfi
+        const uint32_t first = TAtrac3Data::BlockSizeTab[i];
+        const uint32_t last = TAtrac3Data::BlockSizeTab[i + 1];
+        const uint32_t blockSize = last - first;
+
+        auto* unit = static_cast<TAt3SpecUnit*>(
+            cache.GetOrCompute(0, i, precisionPerEachBlocks[i], scaledBlocks[i].Values.data()));
 
-    const uint32_t clcBits = lambda(true, true);
-    const uint32_t vlcBits = lambda(false, false);
-    const bool mode = clcBits <= vlcBits;
-    return std::make_pair(mode, bitsUsed + (mode ? clcBits : vlcBits));
+        // The cast above requires `cache` to yield TAt3SpecUnit objects. Copy the
+        // unit's block-local mantissas to their frame-global offset `first`.
+        std::copy_n(unit->GetMantisas().data(), blockSize, mantisas + first);
+        energyErr[i] = unit->EnergyErr;
+        clcSpecBits += unit->ClcBits;
+        vlcSpecBits += unit->VlcBits;
+    }
+
+    const bool mode = clcSpecBits <= vlcSpecBits;
+    return std::make_pair(mode, bitsUsed + (mode ? clcSpecBits : vlcSpecBits));
 }
 
 static inline bool CheckBfus(uint16_t* numBfu, const vector<uint32_t>& precisionPerEachBlocks)
@@ -593,7 +636,8 @@ public:
         ctx->EnergyErr.assign(ctx->NumBfu, 0.0f);
         std::pair<uint8_t, uint32_t> consumption;
         do {
-            consumption = CalcSpecsBitsConsumption(*ctx->Sce, tmpAlloc, ctx->Mantissas.data(), ctx->EnergyErr);
+            consumption = CalcSpecsBitsConsumption(*ctx->Sce, tmpAlloc, ctx->Mantissas.data(),
+                                                   ctx->EnergyErr, SpecCache);
         } while (ConsiderEnergyErr(ctx->EnergyErr, tmpAlloc));
 
         uint32_t totalBits = consumption.second + EncodeTonalComponents(*ctx->Sce, tmpAlloc, nullptr);
@@ -615,11 +659,13 @@ public:
     }
 
     void Dump(NBitStream::TBitStream& bs) override {
-        if (!Ctx) {
-            return;
+        if (Ctx) {
+            EncodeSpecs(*Ctx->Sce, &bs, Ctx->PrecisionPerBlock, Ctx->CodingMode, Ctx->Mantissas.data());
+            Ctx = nullptr;
         }
-        EncodeSpecs(*Ctx->Sce, &bs, Ctx->PrecisionPerBlock, Ctx->CodingMode, Ctx->Mantissas.data());
-        Ctx = nullptr;
+        // Cached units are valid only for the data just encoded, so clear them
+        // on every Dump() (even when Ctx was null) before this part is reused.
+        SpecCache.Reset();
     }
 
     void Reset() noexcept override {
@@ -632,6 +678,7 @@ public:
 
 private:
     TEncodeCtx* Ctx = nullptr;
+    TEncCache SpecCache{kAt3SpecCacheKeys, &TAt3SpecUnit::Provide, &MakeAt3SpecKey};
 };
 
 std::vector<IBitStreamPartEncoder::TPtr> CreateEncParts()
diff --git a/src/atrac/atrac_enc_cache.cpp b/src/atrac/atrac_enc_cache.cpp
new file mode 100644
index 0000000..2dc781c
--- /dev/null
+++ b/src/atrac/atrac_enc_cache.cpp
@@ -0,0 +1,51 @@
+/*
+ * This file is part of AtracDEnc.
+ *
+ * AtracDEnc is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * AtracDEnc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with AtracDEnc; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "atrac_enc_cache.h"
+
+namespace NAtracDEnc {
+
+// Creates `numKeys` empty (null) slots and stores the callbacks and `opaque`.
+TEncCache::TEncCache(size_t numKeys, TProvideUnit provideUnit, TMakeKey makeKey, void* opaque)
+    : UnitBuffers(numKeys)
+    , ProvideUnit(provideUnit)
+    , MakeKey(makeKey)
+    , Opaque(opaque)
+{}
+
+TUnit* TEncCache::GetOrCompute(size_t ch, size_t bfu, size_t wordlen, const float* values)
+{
+    const size_t key = MakeKey(ch, bfu, wordlen);
+    // at(): a key outside the range given to the constructor throws
+    // std::out_of_range instead of indexing past the end of UnitBuffers.
+    std::unique_ptr<TUnit>& slot = UnitBuffers.at(key);
+    if (!slot) {
+        slot.reset(ProvideUnit(ch, bfu, wordlen, values, Opaque)); // miss: build once, take ownership
+    }
+    return slot.get(); // still owned by the cache
+}
+
+void TEncCache::Reset()
+{
+    // Destroy every cached unit; UnitBuffers itself keeps its size.
+    for (auto& cached : UnitBuffers) {
+        cached = nullptr;
+    }
+}
+
+} // namespace NAtracDEnc
diff --git a/src/atrac/atrac_enc_cache.h b/src/atrac/atrac_enc_cache.h
new file mode 100644
index 0000000..0a9cdb1
--- /dev/null
+++ b/src/atrac/atrac_enc_cache.h
@@ -0,0 +1,95 @@
+#pragma once
+
+/*
+ * This file is part of AtracDEnc.
+ *
+ * AtracDEnc is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * AtracDEnc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with AtracDEnc; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace NAtracDEnc {
+
+// Codec-agnostic base for a single cached encoding unit (BFU / quant unit).
+//
+// A codec subclasses TUnit and quantizes the scaled spectrum into Mantisas,
+// filling the bookkeeping fields needed to later write the unit into the
+// stream. The actual computation lives in the user-supplied ProvideUnit
+// factory (see TEncCache) so the cached hot path carries no extra virtual
+// dispatch. The result is produced once per cache lifetime for a given key
+// and then reused across the bit-allocation search.
+class TUnit {
+public:
+    virtual ~TUnit() = default; // TEncCache stores and destroys units through TUnit pointers
+
+    const std::vector<int>& GetMantisas() const { return Mantisas; }
+    uint32_t GetWordlen() const { return Wordlen; }
+    float GetMultiplier() const { return Multiplier; }
+    uint16_t GetConsumedBits() const { return ConsumedBits; }
+
+protected:
+    // Info needed to write the unit into the stream after encoding.
+    std::vector<int> Mantisas; // quantized spectrum, filled by the codec subclass
+    uint32_t Wordlen = 0; // word length the unit was quantized with
+    float Multiplier = 1.0f; // quantization multiplier recorded by the codec subclass
+    uint16_t ConsumedBits = 0; // Number of bits consumed by the quantized spectrum
+};
+
+// Caches per-unit encoding results during the bit-allocation search.
+//
+// Within a single frame the scaled spectrum of a given (ch, bfu, wordlen)
+// is fixed, so its quantized mantissas and bit consumption are deterministic.
+// The binary search requests the same combinations repeatedly; this cache
+// computes each one once.
+//
+// The key space is small and dense, so units are stored in a vector that is
+// directly indexed by a user-supplied key function (nullptr slot == not yet
+// computed) rather than in a std::map.
+class TEncCache {
+public:
+    // Build the right TUnit subclass for this key and quantize `values` into
+    // it. Invoked only on a cache miss. The result must be allocated with `new`:
+    // the cache takes ownership. `opaque` is the pointer passed to the ctor.
+    using TProvideUnit = TUnit* (*)(size_t ch, size_t bfu, size_t wordlen,
+                                    const float* values, void* opaque);
+
+    // Pack (ch, bfu, wordlen) into a dense vector index. Codec specific.
+    using TMakeKey = size_t (*)(size_t ch, size_t bfu, size_t wordlen);
+
+    // The backing vector is sized once to `numKeys`, so every MakeKey() result
+    // must be < numKeys. ProvideUnit/MakeKey must agree on this bound.
+    TEncCache(size_t numKeys, TProvideUnit provideUnit, TMakeKey makeKey, void* opaque = nullptr);
+
+    TEncCache(const TEncCache&) = delete;
+    TEncCache& operator=(const TEncCache&) = delete;
+
+    // Return the unit for (ch, bfu, wordlen). On a miss it is built via ProvideUnit;
+    // on a hit `values` is ignored. The cache owns it; Reset() destroys it.
+    TUnit* GetOrCompute(size_t ch, size_t bfu, size_t wordlen, const float* values);
+
+    // Drop all cached units. Call at frame boundaries.
+    void Reset();
+
+private:
+    std::vector<std::unique_ptr<TUnit>> UnitBuffers; // direct-indexed by MakeKey()
+    TProvideUnit ProvideUnit; // factory invoked on a cache miss
+    TMakeKey MakeKey; // maps (ch, bfu, wordlen) to an index into UnitBuffers
+    void* Opaque; // passed unchanged to ProvideUnit
+};
+
+} // namespace NAtracDEnc