aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp
blob: 6256a841c2f496576362f1b9879d1c91485fa302 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
                             
 
                                                  
 




                                         
     

                                        
     

                                                       
     

                                                      
     

 
                               

                                                                                                                
                                                                                                        
     






                                                                                                                       
 

                                       
                                       
                              
                                                                                                                                             



                                                  
                                       
                              
                                                                                                                                   






                                                  
                                       
                              
                                                                                                                                       






                                                  
                                       
                              
                                                                                                                                       








                                                  
                                                
                                       
                                                                                                                                       
                                                                     



                                                                                                                    
 
                                                
                                       
                                                                                                                                       







                                                                     
                                       
                              
                                                                                                                                          






                                                    
                                       
                              
                                                                                                                                       






                                                  
                                       
                              
                                                                                                                                         
                                                  
                                                                                                          





                                                 
                                       
                              
                                                                                                                                         
                                                  
                                                                                                          





                                                             
                                       
                              
                                                                                                                                          
                                                  
                                                                                                         





                                                             
                                       
                              
                                                                                                                                          
                                                  
                                                                                                         











                                                            
                                       
                              
                                                                                                                                 
                                                  
                                                                                                          
                                                 


















                                                                                                                 
 
#include "levenshtein_diff.h"

#include <library/cpp/testing/unittest/registar.h>

#include <util/generic/string.h>

namespace {

    float unaryZeroWeigher(const char&) {
        return 0.0f;
    }

    float unaryMaxWeigher(const char&) {
        return 1.0f;
    }

    float binaryZeroWeigher(const char&, const char&) {
        return 0.0f;
    }

    float binaryMaxWeigher(const char&, const char&) {
        return 1.0f;
    }

}

Y_UNIT_TEST_SUITE(Levenstein) {
    Y_UNIT_TEST(Distance) {
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("knight"), TStringBuf("king")), 4);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("spot trail"), TStringBuf("stop trial")), 4);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2);
    }

    Y_UNIT_TEST(DamerauDistance) {
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("knight"), TStringBuf("king")), 3);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("spot trail"), TStringBuf("stop trial")), 3);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("status"), TStringBuf("tsatus")), 1);
        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("hello"), TStringBuf("heh lol")), 3);
    }
}

Y_UNIT_TEST_SUITE(WeightedLevenstein) {
    Y_UNIT_TEST(EqualStrings) {
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("12345"), TString("12345"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
    }

    Y_UNIT_TEST(EmptyStrings) {
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString(""), TString(""), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0);
    }

    Y_UNIT_TEST(InsertsOnly) {
        auto unaryWeigher = [](const char&) {
            return 2.0f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString(""), TString("12345"), chain, &distance, binaryZeroWeigher, unaryZeroWeigher, unaryWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
    }

    Y_UNIT_TEST(DeletionsOnly) {
        auto unaryWeigher = [](const char&) {
            return 3.0f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("54321"), TString(""), chain, &distance, binaryZeroWeigher, unaryWeigher, unaryZeroWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
    }

    Y_UNIT_TEST(SymmetryCheck) {
        const TString str1 = "123x5";
        const TString str2 = "x2345";
        const float trgDistance = 2.0f;
        const size_t trgChainLen = 5;

        NLevenshtein::TEditChain chainLeftRight;
        float distanceLeftRight = 0.0f;
        NLevenshtein::GetEditChain(str1, str2, chainLeftRight, &distanceLeftRight, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance);
        UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[2]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE));

        NLevenshtein::TEditChain chainRightLeft;
        float distanceRightLeft = 0.0f;
        NLevenshtein::GetEditChain(str2, str1, chainRightLeft, &distanceRightLeft, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance);
        UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen);
        UNIT_ASSERT(chainRightLeft == chainLeftRight);
    }

    Y_UNIT_TEST(PreferReplacements) {
        auto binaryWeigher = [](const char&, const char&) {
            return 0.0625f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("54321"), TString("43210"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
    }

    Y_UNIT_TEST(PreferInsertDeletions) {
        auto unaryWeigher = [](const char&) {
            return 0.0625f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("54321"), TString("98765"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryWeigher);
        UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9);
    }

    Y_UNIT_TEST(NoXDeletions) {
        auto unaryWeigher = [](const char& c) {
            return c == 'x' ? 100.0f : 1.0f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("543x1"), TString("5431"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
        UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
    }

    Y_UNIT_TEST(NoXInsertions) {
        auto unaryWeigher = [](const char& c) {
            return c == 'x' ? 100.0f : 1.0f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("5431"), TString("543x1"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryWeigher);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_INSERT));
        UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
    }

    Y_UNIT_TEST(NoReplacementsOfX) {
        auto binaryWeigher = [](const char& l, const char&) {
            return l == 'x' ? 100.0f : 1.0f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("5432x"), TString("5432y"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_INSERT));
        UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
    }

    Y_UNIT_TEST(NoReplacementsForX) {
        auto binaryWeigher = [](const char&, const char& r) {
            return r == 'x' ? 100.0f : 1.0f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("y4321"), TString("x4321"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_DELETE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_INSERT));
        UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
    }

    Y_UNIT_TEST(SimilarOperationPriorities) {
        auto replaceWeigher = [](const char&, const char&) {
            return 0.5f;
        };
        auto deleteWeigher = [](const char&) {
            return 0.2f;
        };
        auto insertWeigher = [](const char&) {
            return 0.9f;
        };
        NLevenshtein::TEditChain chain;
        float distance = 0.0f;
        NLevenshtein::GetEditChain(TString("y0"), TString("0x"), chain, &distance, replaceWeigher, deleteWeigher, insertWeigher);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE));
        UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f);
    }

    Y_UNIT_TEST(DamerauLevenshtein) {
        int distance;
        NLevenshtein::TEditChain chain;
        NLevenshtein::GetEditChainGeneric<TString, true>(TString("status"), TString("tsatus"), chain, &distance);
        UNIT_ASSERT_VALUES_EQUAL(distance, 1);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));

        NLevenshtein::GetEditChainGeneric<TString, true>(TString("hello"), TString("heh lol"), chain, &distance);
        UNIT_ASSERT_VALUES_EQUAL(distance, 3);
        UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[2]), static_cast<int>(NLevenshtein::EMT_INSERT));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_INSERT));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
        UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE));
    }
}