Add similarity comparison to String
Uses the Sorensen-Dice coefficient to calculate similarity. This also adds String.bigrams() as a convenience function needed by the comparison.
This commit is contained in:
parent
f47d7b27cb
commit
1b8d0a16b7
3 changed files with 50 additions and 0 deletions
|
@ -2810,6 +2810,50 @@ bool String::_base_is_subsequence_of(const String& p_string, bool case_insensiti
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Splits the string into its bigrams: one two-character substring for each
// pair of adjacent characters, in order of appearance. A string with fewer
// than two characters has no bigrams and yields an empty vector.
Vector<String> String::bigrams() const {

	Vector<String> pairs;

	const int pair_count = length() - 1;
	if (pair_count > 0) {
		pairs.resize(pair_count);
		for (int start = 0; start < pair_count; start++) {
			// Each bigram begins at `start` and spans two characters.
			pairs[start] = substr(start, 2);
		}
	}

	return pairs;
}
|
||||||
|
|
||||||
|
// Similarity according to Sorensen-Dice coefficient
|
||||||
|
float String::similarity(const String& p_string) const {
|
||||||
|
if(operator==(p_string)) {
|
||||||
|
// Equal strings are totally similar
|
||||||
|
return 1.0f;
|
||||||
|
}
|
||||||
|
if (length() < 2 || p_string.length() < 2) {
|
||||||
|
// No way to calculate similarity without a single bigram
|
||||||
|
return 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
Vector<String> src_bigrams = bigrams();
|
||||||
|
Vector<String> tgt_bigrams = p_string.bigrams();
|
||||||
|
|
||||||
|
int src_size = src_bigrams.size();
|
||||||
|
int tgt_size = tgt_bigrams.size();
|
||||||
|
|
||||||
|
float sum = src_size + tgt_size;
|
||||||
|
float inter = 0;
|
||||||
|
for (int i = 0; i < src_size; i++) {
|
||||||
|
for (int j = 0; j < tgt_size; j++) {
|
||||||
|
if (src_bigrams[i] == tgt_bigrams[j]) {
|
||||||
|
inter++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (2.0f * inter)/sum;
|
||||||
|
}
|
||||||
|
|
||||||
static bool _wildcard_match(const CharType* p_pattern, const CharType* p_string,bool p_case_sensitive) {
|
static bool _wildcard_match(const CharType* p_pattern, const CharType* p_string,bool p_case_sensitive) {
|
||||||
switch (*p_pattern) {
|
switch (*p_pattern) {
|
||||||
case '\0':
|
case '\0':
|
||||||
|
|
|
@ -123,6 +123,8 @@ public:
|
||||||
bool ends_with(const String& p_string) const;
|
bool ends_with(const String& p_string) const;
|
||||||
bool is_subsequence_of(const String& p_string) const;
|
bool is_subsequence_of(const String& p_string) const;
|
||||||
bool is_subsequence_ofi(const String& p_string) const;
|
bool is_subsequence_ofi(const String& p_string) const;
|
||||||
|
// Returns the two-character substrings starting at each position of the string (its bigrams); empty when the string has fewer than two characters.
Vector<String> bigrams() const;
|
||||||
|
// Similarity score against p_string based on the Sorensen-Dice coefficient over bigrams: 1.0 for equal strings, 0.0 when either string has fewer than two characters.
float similarity(const String& p_string) const;
|
||||||
String replace_first(String p_key,String p_with) const;
|
String replace_first(String p_key,String p_with) const;
|
||||||
String replace(String p_key,String p_with) const;
|
String replace(String p_key,String p_with) const;
|
||||||
String replacen(String p_key,String p_with) const;
|
String replacen(String p_key,String p_with) const;
|
||||||
|
|
|
@ -249,6 +249,8 @@ static void _call_##m_type##_##m_method(Variant& r_ret,Variant& p_self,const Var
|
||||||
VCALL_LOCALMEM1R(String,ends_with);
|
VCALL_LOCALMEM1R(String,ends_with);
|
||||||
VCALL_LOCALMEM1R(String,is_subsequence_of);
|
VCALL_LOCALMEM1R(String,is_subsequence_of);
|
||||||
VCALL_LOCALMEM1R(String,is_subsequence_ofi);
|
VCALL_LOCALMEM1R(String,is_subsequence_ofi);
|
||||||
|
VCALL_LOCALMEM0R(String,bigrams);
|
||||||
|
VCALL_LOCALMEM1R(String,similarity);
|
||||||
VCALL_LOCALMEM2R(String,replace);
|
VCALL_LOCALMEM2R(String,replace);
|
||||||
VCALL_LOCALMEM2R(String,replacen);
|
VCALL_LOCALMEM2R(String,replacen);
|
||||||
VCALL_LOCALMEM2R(String,insert);
|
VCALL_LOCALMEM2R(String,insert);
|
||||||
|
@ -1274,6 +1276,8 @@ _VariantCall::addfunc(Variant::m_vtype,Variant::m_ret,_SCS(#m_method),VCALL(m_cl
|
||||||
ADDFUNC1(STRING,BOOL,String,ends_with,STRING,"text",varray());
|
ADDFUNC1(STRING,BOOL,String,ends_with,STRING,"text",varray());
|
||||||
ADDFUNC1(STRING,BOOL,String,is_subsequence_of,STRING,"text",varray());
|
ADDFUNC1(STRING,BOOL,String,is_subsequence_of,STRING,"text",varray());
|
||||||
ADDFUNC1(STRING,BOOL,String,is_subsequence_ofi,STRING,"text",varray());
|
ADDFUNC1(STRING,BOOL,String,is_subsequence_ofi,STRING,"text",varray());
|
||||||
|
ADDFUNC0(STRING,STRING_ARRAY,String,bigrams,varray());
|
||||||
|
ADDFUNC1(STRING,REAL,String,similarity,STRING,"text",varray());
|
||||||
|
|
||||||
ADDFUNC2(STRING,STRING,String,replace,STRING,"what",STRING,"forwhat",varray());
|
ADDFUNC2(STRING,STRING,String,replace,STRING,"what",STRING,"forwhat",varray());
|
||||||
ADDFUNC2(STRING,STRING,String,replacen,STRING,"what",STRING,"forwhat",varray());
|
ADDFUNC2(STRING,STRING,String,replacen,STRING,"what",STRING,"forwhat",varray());
|
||||||
|
|
Loading…
Reference in a new issue