NOTICE: By continued use of this site you understand and agree to the binding Terms of Service and Privacy Policy.
// ==UserScript==
// ==UserLibrary==
// @name libStringDistance
// @namespace https://openuserjs.org/users/Anakunda
// @version 1.03
// @license GPL-3.0-or-later
// @copyright 2021, Anakunda (https://openuserjs.org/users/Anakunda)
// @description String Similarity Comparision
// @exclude *
// ==/UserScript==
// ==/UserLibrary==
function hammingDistance(strA, strB) {
if (typeof strA != 'string' || typeof strB != 'string') throw 'Invalid arguments';
var d = 0, h = strA ^ strB;
while (h > 0) { ++d; h &= h - 1; }
return d;
}
function levenshteinDistance(strA, strB) {
if (!strA || !strB) return 0;
const matrix = [ ];
for (let i = 0; i <= strB.length; ++i) matrix[i] = [i];
for (let j = 0; j <= strA.length; ++j) matrix[0][j] = j;
for (i = 1; i <= strB.length; ++i) for (j = 1; j <= strA.length; ++j)
matrix[i][j] = strB.charAt(i - 1) == strA.charAt(j - 1) ? matrix[i - 1][j - 1]
: Math.min(matrix[i - 1][j - 1] + 1, Math.min(matrix[i][j - 1] + 1, matrix[i - 1][j] + 1));
return matrix[strB.length][strA.length];
}
const levensteinDistance = levenshteinDistance;
function cosineSimilarity(strA, strB) {
function termFreqMap(str) {
let words = str.split(' '), termFreq = { };
words.forEach(w => { termFreq[w] = (termFreq[w] || 0) + 1 });
return termFreq;
}
function termFreqMapToVector(map, dict) {
let termFreqVector = [ ];
for (let term in dict) termFreqVector.push(map[term] || 0);
return termFreqVector;
}
function vecDotProduct(vecA, vecB) {
let product = 0;
for (let i = 0; i < vecA.length; ++i) product += vecA[i] * vecB[i];
return product;
}
function vecMagnitude(vec) {
let sum = 0;
for (let i = 0; i < vec.length; ++i) sum += vec[i] * vec[i];
return Math.sqrt(sum);
}
if (!strA || !strB) return 0;
const addKeysToDict = (map, dict) => { for (let key in map) dict[key] = true }
const _cosineSimilarity = (vecA, vecB) => vecDotProduct(vecA, vecB) / (vecMagnitude(vecA) * vecMagnitude(vecB));
let termFreqA = termFreqMap(strA), termFreqB = termFreqMap(strB), dict = { };
addKeysToDict(termFreqA, dict);
addKeysToDict(termFreqB, dict);
let termFreqVecA = termFreqMapToVector(termFreqA, dict), termFreqVecB = termFreqMapToVector(termFreqB, dict);
return _cosineSimilarity(termFreqVecA, termFreqVecB);
}
function jaroWinklerSimilarity(strA, strB) {
if (!strA || !strB) return 0; else if (strA == strB) return 1;
let m = 0, range = (Math.floor(Math.max(strA.length, strB.length) / 2)) - 1;
let s1Matches = Array(strA.length), s2Matches = Array(strB.length);
for (let i = 0; i < strA.length; ++i) {
const low = (i >= range) ? i - range : 0;
const high = (i + range <= strB.length) ? (i + range) : (strB.length - 1);
for (let j = low; j <= high; ++j) {
if (s1Matches[i] === true || s2Matches[j] === true || strA[i] !== strB[j]) continue;
++m;
s1Matches[i] = s2Matches[j] = true;
break;
}
}
if (m == 0) return 0;
let k = 0, n_trans = 0;
for (let i = 0, j; i < strA.length; ++i) {
if (s1Matches[i] !== true) continue;
for (j = k; j < strB.length; ++j) {
if (s2Matches[j] !== true) continue;
k = j + 1;
break;
}
if (strA[i] !== strB[j]) ++n_trans;
}
let weight = (m / strA.length + m / strB.length + (m - (n_trans / 2)) / m) / 3, l = 0, p = 0.1;
if (weight > 0.7) {
while (strA[l] === strB[l] && l < 4) ++l;
weight += l * p * (1 - weight);
}
return weight;
}
const jaroWrinkerSimilarity = jaroWinklerSimilarity;
function trigramIndex(inputPhrases) {
function asTrigrams(phrase, callback) {
var rawData = " ".concat(phrase, " ");
for (var i = rawData.length - 3; i >= 0; i = i - 1) callback.call(this, rawData.slice(i, i + 3));
};
if (!Array.isArray(inputPhrases)) return null;
var instance = {
phrases: [],
trigramIndex: [],
index: function(phrase) {
if (!phrase || this.phrases.includes(phrase)) return;
var phraseIndex = this.phrases.push(phrase) - 1;
asTrigrams.call(this, phrase, function (trigram) {
var phrasesForTrigram = this.trigramIndex[trigram];
if (!phrasesForTrigram) phrasesForTrigram = [];
if (phrasesForTrigram.indexOf(phraseIndex) < 0) phrasesForTrigram.push(phraseIndex);
this.trigramIndex[trigram] = phrasesForTrigram;
});
},
find: function(phrase) {
var phraseMatches = [], trigramsInPhrase = 0;
asTrigrams.call(this, phrase, function(trigram) {
var phrasesForTrigram = this.trigramIndex[trigram];
trigramsInPhrase += 1;
if (phrasesForTrigram) for (var j in phrasesForTrigram) {
let phraseIndex = phrasesForTrigram[j];
if (!phraseMatches[phraseIndex]) phraseMatches[phraseIndex] = 0;
phraseMatches[phraseIndex] += 1;
}
});
var result = [];
for (var i in phraseMatches) result.push({ phrase: this.phrases[i], matches: phraseMatches[i] });
result.sort((a, b) => b.matches - a.matches);
return result;
}
};
for (let i in inputPhrases) instance.index(inputPhrases[i]);
return instance;
}
const trigramCompare = (strA, strB) => trigramIndex([strA]).find(strB);
// Sift4 - extended version
// online algorithm to compute the distance between two strings in O(n)
// maxOffset is the number of positions to search for matching tokens
// options: the options for the function, allowing for customization of the scope and algorithm:
// maxDistance: the distance at which the algorithm should stop computing the value and just exit (the strings are too different anyway)
// tokenizer: a function to transform strings into vectors of tokens
// tokenMatcher: a function to determine if two tokens are matching (equal)
// matchingEvaluator: a function to determine the way a token match should be added to the local_cs. For example a fuzzy match could be implemented.
// localLengthEvaluator: a function to determine the way the local_cs value is added to the lcss. For example longer continuous substrings could be awarded.
// transpositionCostEvaluator: a function to determine the value of an individual transposition. For example longer transpositions should have a higher cost.
// transpositionsEvaluator: a function to determine the way the total cost of transpositions affects the final result
// the options can and should be implemented at a class level, but this is the demo algorithm
function sift4distance(strA, strB, maxOffset = 5, options) {
options = (function extend(obj, def) {
let result = { };
for (let prop in def) result[prop] = (!obj || !obj.hasOwnProperty(prop) ? def : obj)[prop];
return result;
})(options, {
maxDistance: null,
tokenizer: s => s ? s.split('') : [ ],
tokenMatcher: (t1, t2) => t1 == t2,
matchingEvaluator: (t1, t2) => 1,
localLengthEvaluator: local_cs => local_cs,
transpositionCostEvaluator: (c1, c2) => 1,
transpositionsEvaluator: (lcss, trans) => lcss - trans,
});
let t1 = options.tokenizer(strA), t2 = options.tokenizer(strB);
let l1 = t1.length, l2 = t2.length;
if (l1 == 0) return l2;
if (l2 == 0) return l1;
let c1 = 0, c2 = 0, lcss = 0, local_cs = 0, trans = 0, offset_arr = [ ];
while ((c1 < l1) && (c2 < l2)) {
if (options.tokenMatcher(t1[c1], t2[c2])) {
local_cs += options.matchingEvaluator(t1[c1], t2[c2]);
//see if current match is a transposition
let isTrans = false, i = 0;
while (i < offset_arr.length) {
let ofs = offset_arr[i];
if (c1 <= ofs.c1 || c2 <= ofs.c2) {
// when two matches cross, the one considered a transposition is the one with the largest difference in offsets
isTrans = Math.abs(c2 - c1) >= Math.abs(ofs.c2 - ofs.c1);
if (isTrans) trans += options.transpositionCostEvaluator(c1, c2);
else if (!ofs.trans) {
ofs.trans = true;
trans += options.transpositionCostEvaluator(ofs.c1, ofs.c2);
}
break;
} else if (c1 > ofs.c2 && c2 > ofs.c1) offset_arr.splice(i, 1); else ++i;
}
offset_arr.push({ c1: c1, c2: c2, trans: isTrans });
} else {
lcss += options.localLengthEvaluator(local_cs);
local_cs = 0;
if (c1 != c2) c1 = c2 = Math.min(c1, c2); // using min allows the computation of transpositions
if (options.maxDistance) {
let temporaryDistance = options.localLengthEvaluator(Math.max(c1, c2)) -
options.transpositionsEvaluator(lcss, trans);
if (temporaryDistance > options.maxDistance) return Math.round(temporaryDistance);
}
// if matching tokens are found, remove 1 from both cursors (they get incremented at the end of the loop)
// so that we can have only one code block handling matches
for (let i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); i++) {
if ((c1 + i < l1) && options.tokenMatcher(t1[c1 + i], t2[c2])) { c1 += i - 1; c2--; break; }
if ((c2 + i < l2) && options.tokenMatcher(t1[c1], t2[c2 + i])) { c1--; c2 += i - 1; break; }
}
}
++c1; ++c2;
// this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly
if ((c1 >= l1) || (c2 >= l2)) {
lcss += options.localLengthEvaluator(local_cs);
local_cs = 0;
c1 = c2 = Math.min(c1, c2);
}
}
lcss += options.localLengthEvaluator(local_cs);
return Math.round(options.localLengthEvaluator(Math.max(l1, l2)) - options.transpositionsEvaluator(lcss, trans)); //add the cost of found transpositions
}
// possible values for the options
// tokenizers:
function nGramTokenizer(s, n) { //tokenizer:function(s) { return nGramTokenizer(s,2); }
let result = [ ];
if (s) for (let i = 0; i <= s.length - n; ++i) result.push(s.substr(i, n));
return result;
}
const wordSplitTokenizer = s => s ? s.split(/\s+/) : [ ];
function characterFrequencyTokenizer(s) { //tokenizer:characterFrequencyTokenizer (letters only)
let result = [ ];
for (let i = 0; i < 26; ++i) {
let val = 0;
if (s) for (let j = 0; j < s.length; ++j) {
let code = s.charCodeAt(j);
if (code == i + 65 || code == i + 97) ++val;
}
result.push(val);
}
return result;
}
// tokenMatchers:
const sift4TokenMatcher = (t1, t2) => (1 - sift4distance(t1, t2, 5) / Math.max(t1.length, t2.length)) > 0.7;
// matchingEvaluators:
const sift4MatchingEvaluator = (t1, t2) => 1 - sift4distance(t1, t2, 5) / Math.max(t1.length, t2.length);
// localLengthEvaluators:
const rewardLengthEvaluator = l => l < 1 ? l : l - 1 / (l + 1); //1 -> 0.5, 2-> 0.66, 9 -> 0.9
const rewardLengthEvaluator2 = l => Math.pow(l, 1.5); // 0 -> 0, 1 -> 1, 2 -> 2.83, 10 -> 31.62
// transpositionCostEvaluators:
const longerTranspositionsAreMoreCostly = (c1, c2) => Math.abs(c2 - c1) / 9 + 1;
//////////////////////////////////////////////////////////////////////////
////////////////////////////// SAFE PADDING //////////////////////////////
//////////////////////////////////////////////////////////////////////////