#pragma once
#include <util/draft/matrix.h>
#include <util/generic/algorithm.h>
#include <util/generic/vector.h>
#include <util/system/yassert.h>
#include <type_traits>
#include <utility>
namespace NLevenshtein {
enum EEditMoveType {
EMT_SPECIAL,
EMT_PRESERVE,
EMT_REPLACE,
EMT_DELETE,
EMT_INSERT,
EMT_TRANSPOSE
};
inline bool IsImportantEditMove(EEditMoveType p) {
return (p != EMT_SPECIAL && p != EMT_PRESERVE);
}
inline void MakeMove(EEditMoveType t, int& p1, int& p2) {
switch (t) {
case EMT_TRANSPOSE:
p1 += 2;
p2 += 2;
break;
case EMT_PRESERVE:
case EMT_REPLACE:
p1++;
p2++;
break;
case EMT_DELETE:
p1++;
break;
case EMT_INSERT:
p2++;
break;
default:
break;
}
}
using TEditChain = TVector<EEditMoveType>;
template <typename TArgType, typename TWeightType = int>
struct TWeightOneUnaryGetter {
TWeightType operator()(const TArgType&) const {
return 1;
}
};
template <typename TArgType, typename TWeightType = int>
struct TWeightOneBinaryGetter {
TWeightType operator()(const TArgType&, const TArgType&) const {
return 1;
}
};
template <typename TArgType, typename TWeightType = int>
struct TWeightInfBinaryGetter {
TWeightType operator()(const TArgType&, const TArgType&) const {
return std::numeric_limits<TWeightType>::max();
}
};
template <typename TStringType>
using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>;
/// Finds sequence of "edit moves" for two strings
template <class TStringType, class TWeightType = int,
class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>, TWeightType>,
class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>, TWeightType>,
class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>, TWeightType>,
class TTransposeWeigher = TWeightInfBinaryGetter<TCharType<TStringType>, TWeightType>
>
void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr,
const TReplaceWeigher& replaceWeigher = TReplaceWeigher(),
const TDeleteWeigher& deleteWeigher = TDeleteWeigher(),
const TInsertWeigher& insertWeigher = TInsertWeigher(),
const TTransposeWeigher& transposeWeigher = TTransposeWeigher())
{
int l1 = (int)str1.size();
int l2 = (int)str2.size();
TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1])
ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point
for (int i = 1; i <= l1; i++) {
ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE);
}
for (int i = 1; i <= l2; i++) {
ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT);
}
const TWeightType maxWeight = std::numeric_limits<TWeightType>::max();
// Here goes basic Damerau-Levenshtein's algorithm
for (int i = 1; i <= l1; i++) {
for (int j = 1; j <= l2; j++) {
if (str1[i - 1] == str2[j - 1]) {
ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE);
} else {
const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]);
Y_ASSERT(replaceWeight >= 0);
if (replaceWeight < maxWeight) {
ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE);
} else {
ma[i][j] = std::make_pair(replaceWeight, EMT_REPLACE);
}
}
if (ma[i][j].first > ma[i - 1][j].first) {
const TWeightType deleteWeight = deleteWeigher(str1[i - 1]);
Y_ASSERT(deleteWeight >= 0);
if (deleteWeight < maxWeight) {
const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight;
if (deletePathWeight <= ma[i][j].first) {
ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE);
}
}
}
if (ma[i][j].first > ma[i][j - 1].first) {
const TWeightType insertWeight = insertWeigher(str2[j - 1]);
Y_ASSERT(insertWeight >= 0);
if (insertWeight < maxWeight) {
const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight;
if (insertPathWeight <= ma[i][j].first) {
ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT);
}
}
}
if (i > 1 && j > 1 && str1[i - 1] == str2[j - 2] && str1[i - 2] == str2[j - 1]) {
const TWeightType transposeWeight = transposeWeigher(str1[i - 2], str2[j - 2]);
Y_ASSERT(transposeWeight >= 0);
if (transposeWeight < maxWeight) {
const TWeightType transposePathWeight = ma[i - 2][j - 2].first + transposeWeight;
if (transposePathWeight <= ma[i][j].first) {
ma[i][j] = std::make_pair(transposePathWeight, EMT_TRANSPOSE);
}
}
}
}
}
// Tracing the path from final point
res.clear();
res.reserve(Max<size_t>(l1, l2));
for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) {
res.push_back(ma[i][j].second);
switch (ma[i][j].second) {
case EMT_TRANSPOSE:
i -= 2;
j -= 2;
break;
case EMT_PRESERVE:
case EMT_REPLACE:
--i;
--j;
break;
case EMT_DELETE:
--i;
break;
case EMT_INSERT:
--j;
break;
default:
// TODO: throw exception
break;
}
}
std::reverse(res.begin(), res.end());
if (weight != nullptr) {
*weight = ma[l1][l2].first;
}
}
template <class TStringType, bool Damerau = false, class TWeightType = int>
void GetEditChainGeneric(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr) {
typedef TCharType<TStringType> TArgType;
GetEditChain<
TStringType, TWeightType,
TWeightOneBinaryGetter<TArgType, TWeightType>,
TWeightOneUnaryGetter<TArgType, TWeightType>,
TWeightOneUnaryGetter<TArgType, TWeightType>,
std::conditional_t<
Damerau,
TWeightOneBinaryGetter<TArgType, TWeightType>,
TWeightInfBinaryGetter<TArgType, TWeightType>
>
>(str1, str2, res, weight);
}
template <class TStringType, bool Damerau = false>
size_t DistanceImpl(const TStringType& str1, const TStringType& str2) {
if (str1.size() > str2.size()) {
return DistanceImpl<TStringType, Damerau>(str2, str1);
}
size_t size1 = str1.size();
size_t size2 = str2.size();
TVector<size_t> currentRow(size1 + 1);
TVector<size_t> previousRow(size1 + 1);
TVector<size_t> transpositionRow(size1 + 1);
for (size_t i = 0; i <= size1; ++i) {
previousRow[i] = i;
}
for (size_t i = 1; i <= size2; ++i) {
currentRow[0] = i;
for (size_t j = 1; j <= size1; ++j) {
size_t cost = str1[j - 1] == str2[i - 1] ? 0 : 1;
currentRow[j] = std::min(previousRow[j - 1] + cost, std::min(currentRow[j - 1], previousRow[j]) + 1);
if (Damerau && i > 1 && j > 1 && str1[j - 2] == str2[i - 1] && str1[j - 1] == str2[i - 2]) {
currentRow[j] = std::min(currentRow[j], transpositionRow[j - 2] + cost);
}
}
if (Damerau) {
std::swap(transpositionRow, previousRow);
}
std::swap(previousRow, currentRow);
}
return previousRow[size1];
}
template <class TStringType>
size_t Distance(const TStringType& str1, const TStringType& str2) {
return DistanceImpl<TStringType, false>(str1, str2);
}
template <class TStringType>
size_t DamerauDistance(const TStringType& str1, const TStringType& str2) {
return DistanceImpl<TStringType, true>(str1, str2);
}
/// Calculates substrings to be replaced for str1->str2 transformation
struct TReplacement {
int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength;
TReplacement()
: CorrectOffset(0)
, CorrectLength(0)
, MisspelledOffset(0)
, MisspelledLength(0)
{
}
TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength)
: CorrectOffset(correctOffset)
, CorrectLength(correctLength)
, MisspelledOffset(misspelledOffset)
, MisspelledLength(misspelledLength)
{
}
};
template <class TStringType>
void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) {
TEditChain editChain;
GetEditChain(str1, str2, editChain);
editChain.push_back(EMT_SPECIAL);
int c1 = 0, c2 = 0;
res.clear();
for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) {
if (IsImportantEditMove(*it)) {
int sc1 = c1, sc2 = c2;
do {
MakeMove(*it, c1, c2);
++it;
} while (IsImportantEditMove(*it));
res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2));
}
MakeMove(*it, c1, c2);
}
}
}