// ==UserScript==
// ==UserLibrary==
// @name          libStringDistance
// @namespace     https://openuserjs.org/users/Anakunda
// @version       1.03
// @license       GPL-3.0-or-later
// @copyright     2021, Anakunda (https://openuserjs.org/users/Anakunda)
// @description   String Similarity Comparison
// @exclude       *
// ==/UserScript==
// ==/UserLibrary==

// Hamming distance: the number of positions at which two equal-length strings differ.
function hammingDistance(strA, strB) {
    if (typeof strA != 'string' || typeof strB != 'string' || strA.length != strB.length)
        throw 'Invalid arguments';
    let d = 0;
    for (let i = 0; i < strA.length; ++i) if (strA[i] !== strB[i]) ++d;
    return d;
}

// Levenshtein distance: the minimum number of single-character edits
// (insertions, deletions, substitutions) needed to turn strA into strB.
function levenshteinDistance(strA, strB) {
    if (!strA) return strB ? strB.length : 0; // an empty string costs the full length of the other
    if (!strB) return strA.length;
    const matrix = [ ];
    for (let i = 0; i <= strB.length; ++i) matrix[i] = [i];
    for (let j = 0; j <= strA.length; ++j) matrix[0][j] = j;
    for (let i = 1; i <= strB.length; ++i)
        for (let j = 1; j <= strA.length; ++j)
            matrix[i][j] = strB.charAt(i - 1) == strA.charAt(j - 1) ? matrix[i - 1][j - 1]
                : Math.min(matrix[i - 1][j - 1] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j] + 1);
    return matrix[strB.length][strA.length];
}
// Alias for a common misspelling, kept for backwards compatibility.
const levensteinDistance = levenshteinDistance;

// Cosine similarity of the two strings' word-frequency vectors
// (1 = identical word multisets, 0 = no words in common).
function cosineSimilarity(strA, strB) {
    function termFreqMap(str) {
        let words = str.split(/\s+/), termFreq = { };
        words.forEach(w => { termFreq[w] = (termFreq[w] || 0) + 1 });
        return termFreq;
    }
    function termFreqMapToVector(map, dict) {
        let termFreqVector = [ ];
        for (let term in dict) termFreqVector.push(map[term] || 0);
        return termFreqVector;
    }
    function vecDotProduct(vecA, vecB) {
        let product = 0;
        for (let i = 0; i < vecA.length; ++i) product += vecA[i] * vecB[i];
        return product;
    }
    function vecMagnitude(vec) {
        let sum = 0;
        for (let i = 0; i < vec.length; ++i) sum += vec[i] * vec[i];
        return Math.sqrt(sum);
    }
    if (!strA || !strB) return 0;
    const addKeysToDict = (map, dict) => { for (let key in map) dict[key] = true };
    const _cosineSimilarity = (vecA, vecB) =>
        vecDotProduct(vecA, vecB) / (vecMagnitude(vecA) * vecMagnitude(vecB));
    let termFreqA = termFreqMap(strA), termFreqB = termFreqMap(strB), dict = { };
    addKeysToDict(termFreqA, dict);
    addKeysToDict(termFreqB, dict);
    let termFreqVecA = termFreqMapToVector(termFreqA, dict),
        termFreqVecB = termFreqMapToVector(termFreqB, dict);
    return _cosineSimilarity(termFreqVecA, termFreqVecB);
}
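// A quick usage sketch (not part of the library; the values are what these
// classic examples evaluate to):
//   hammingDistance('karolin', 'kathrin');    // 3 - differs at three positions
//   levenshteinDistance('kitten', 'sitting'); // 3 - two substitutions, one insertion
//   cosineSimilarity('this is a test', 'this is a trial'); // 0.75 - 3 of 4 words shared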
// Jaro-Winkler similarity: a 0..1 score built from matching characters and
// transpositions, with a bonus for a common prefix of up to four characters.
function jaroWinklerSimilarity(strA, strB) {
    if (!strA || !strB) return 0;
    else if (strA == strB) return 1;
    let m = 0;
    const range = Math.floor(Math.max(strA.length, strB.length) / 2) - 1;
    const s1Matches = Array(strA.length), s2Matches = Array(strB.length);
    for (let i = 0; i < strA.length; ++i) {
        const low = i >= range ? i - range : 0;
        const high = i + range < strB.length ? i + range : strB.length - 1;
        for (let j = low; j <= high; ++j) {
            if (s1Matches[i] === true || s2Matches[j] === true || strA[i] !== strB[j]) continue;
            ++m;
            s1Matches[i] = s2Matches[j] = true;
            break;
        }
    }
    if (m == 0) return 0;
    let k = 0, n_trans = 0;
    for (let i = 0, j; i < strA.length; ++i) {
        if (s1Matches[i] !== true) continue;
        for (j = k; j < strB.length; ++j) {
            if (s2Matches[j] !== true) continue;
            k = j + 1;
            break;
        }
        if (strA[i] !== strB[j]) ++n_trans;
    }
    let weight = (m / strA.length + m / strB.length + (m - n_trans / 2) / m) / 3, l = 0;
    const p = 0.1; // strength of the common-prefix bonus
    if (weight > 0.7) {
        while (l < 4 && strA[l] === strB[l]) ++l;
        weight += l * p * (1 - weight);
    }
    return weight;
}
// Alias for a common misspelling, kept for backwards compatibility.
const jaroWrinkerSimilarity = jaroWinklerSimilarity;

// Trigram index: indexes phrases by their character trigrams (padded with one
// space at each end), so similar phrases can be found by counting shared trigrams.
function trigramIndex(inputPhrases) {
    function asTrigrams(phrase, callback) {
        const rawData = ' '.concat(phrase, ' ');
        for (let i = rawData.length - 3; i >= 0; --i) callback.call(this, rawData.slice(i, i + 3));
    }
    if (!Array.isArray(inputPhrases)) return null;
    const instance = {
        phrases: [ ],
        trigramIndex: { },
        index: function(phrase) {
            if (!phrase || this.phrases.includes(phrase)) return;
            const phraseIndex = this.phrases.push(phrase) - 1;
            asTrigrams.call(this, phrase, function(trigram) {
                const phrasesForTrigram = this.trigramIndex[trigram] || [ ];
                if (!phrasesForTrigram.includes(phraseIndex)) phrasesForTrigram.push(phraseIndex);
                this.trigramIndex[trigram] = phrasesForTrigram;
            });
        },
        find: function(phrase) {
            const phraseMatches = [ ];
            asTrigrams.call(this, phrase, function(trigram) {
                const phrasesForTrigram = this.trigramIndex[trigram];
                if (phrasesForTrigram) for (const phraseIndex of phrasesForTrigram)
                    phraseMatches[phraseIndex] = (phraseMatches[phraseIndex] || 0) + 1;
            });
            const result = [ ];
            for (const i in phraseMatches) result.push({ phrase: this.phrases[i], matches: phraseMatches[i] });
            result.sort((a, b) => b.matches - a.matches);
            return result;
        },
    };
    for (const phrase of inputPhrases) instance.index(phrase);
    return instance;
}
const trigramCompare = (strA, strB) => trigramIndex([strA]).find(strB);
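// A quick usage sketch (the values are what these classic examples evaluate to):
//   jaroWinklerSimilarity('MARTHA', 'MARHTA'); // ~0.961 - one transposition plus the "MAR" prefix bonus
//   trigramCompare('hello', 'hallo');          // [{ phrase: 'hello', matches: 2 }] - shares "llo" and "lo "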
// Sift4 - extended version
// An online algorithm to compute the distance between two strings in O(n).
// maxOffset is the number of positions to search for matching tokens.
// options allows customization of the scope and algorithm:
//   maxDistance: the distance at which the algorithm should stop computing the value
//     and just exit (the strings are too different anyway)
//   tokenizer: a function to transform strings into vectors of tokens
//   tokenMatcher: a function to determine if two tokens are matching (equal)
//   matchingEvaluator: a function to determine the way a token match is added to
//     local_cs; for example, a fuzzy match could be implemented
//   localLengthEvaluator: a function to determine the way the local_cs value is added
//     to the lcss; for example, longer contiguous substrings could be rewarded
//   transpositionCostEvaluator: a function to determine the cost of an individual
//     transposition; for example, longer transpositions could have a higher cost
//   transpositionsEvaluator: a function to determine the way the total cost of
//     transpositions affects the final result
// The options can and should be implemented at a class level, but this is the demo algorithm.
function sift4distance(strA, strB, maxOffset = 5, options) {
    options = (function extend(obj, def) {
        const result = { };
        for (let prop in def) result[prop] = (!obj || !obj.hasOwnProperty(prop) ? def : obj)[prop];
        return result;
    })(options, {
        maxDistance: null,
        tokenizer: s => s ? s.split('') : [ ],
        tokenMatcher: (t1, t2) => t1 == t2,
        matchingEvaluator: (t1, t2) => 1,
        localLengthEvaluator: local_cs => local_cs,
        transpositionCostEvaluator: (c1, c2) => 1,
        transpositionsEvaluator: (lcss, trans) => lcss - trans,
    });
    const t1 = options.tokenizer(strA), t2 = options.tokenizer(strB);
    const l1 = t1.length, l2 = t2.length;
    if (l1 == 0) return l2;
    if (l2 == 0) return l1;
    let c1 = 0, c2 = 0, lcss = 0, local_cs = 0, trans = 0;
    const offset_arr = [ ];
    while (c1 < l1 && c2 < l2) {
        if (options.tokenMatcher(t1[c1], t2[c2])) {
            local_cs += options.matchingEvaluator(t1[c1], t2[c2]);
            // See if the current match is a transposition.
            let isTrans = false, i = 0;
            while (i < offset_arr.length) {
                const ofs = offset_arr[i];
                if (c1 <= ofs.c1 || c2 <= ofs.c2) {
                    // When two matches cross, the one considered a transposition is
                    // the one with the largest difference in offsets.
                    isTrans = Math.abs(c2 - c1) >= Math.abs(ofs.c2 - ofs.c1);
                    if (isTrans) trans += options.transpositionCostEvaluator(c1, c2);
                    else if (!ofs.trans) {
                        ofs.trans = true;
                        trans += options.transpositionCostEvaluator(ofs.c1, ofs.c2);
                    }
                    break;
                } else if (c1 > ofs.c2 && c2 > ofs.c1) offset_arr.splice(i, 1);
                else ++i;
            }
            offset_arr.push({ c1: c1, c2: c2, trans: isTrans });
        } else {
            lcss += options.localLengthEvaluator(local_cs);
            local_cs = 0;
            if (c1 != c2) c1 = c2 = Math.min(c1, c2); // using min allows the computation of transpositions
            if (options.maxDistance) {
                const temporaryDistance = options.localLengthEvaluator(Math.max(c1, c2))
                    - options.transpositionsEvaluator(lcss, trans);
                if (temporaryDistance > options.maxDistance) return Math.round(temporaryDistance);
            }
            // If matching tokens are found, remove 1 from both cursors (they get
            // incremented at the end of the loop) so that only one code block handles matches.
            for (let i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); ++i) {
                if (c1 + i < l1 && options.tokenMatcher(t1[c1 + i], t2[c2])) { c1 += i - 1; --c2; break; }
                if (c2 + i < l2 && options.tokenMatcher(t1[c1], t2[c2 + i])) { --c1; c2 += i - 1; break; }
            }
        }
        ++c1; ++c2;
        // This covers the case where the last match is on the last token in the list,
        // so that transpositions can still be computed correctly.
        if (c1 >= l1 || c2 >= l2) {
            lcss += options.localLengthEvaluator(local_cs);
            local_cs = 0;
            c1 = c2 = Math.min(c1, c2);
        }
    }
    lcss += options.localLengthEvaluator(local_cs);
    // Add the cost of found transpositions.
    return Math.round(options.localLengthEvaluator(Math.max(l1, l2)) - options.transpositionsEvaluator(lcss, trans));
}

// Possible values for the options:
// Tokenizers:
function nGramTokenizer(s, n) { // tokenizer: s => nGramTokenizer(s, 2)
    const result = [ ];
    if (s) for (let i = 0; i <= s.length - n; ++i) result.push(s.substr(i, n));
    return result;
}
const wordSplitTokenizer = s => s ? s.split(/\s+/) : [ ];
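// A quick usage sketch: with the default options Sift4 works on character
// tokens, so a single substitution costs 1; wordSplitTokenizer switches it to word tokens.
//   sift4distance('London', 'Londen'); // 1
//   sift4distance('one two three', 'one two four', 5,
//       { tokenizer: wordSplitTokenizer }); // 1 - one word differs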
function characterFrequencyTokenizer(s) { // tokenizer: characterFrequencyTokenizer (letters only)
    const result = [ ];
    for (let i = 0; i < 26; ++i) {
        let val = 0;
        if (s) for (let j = 0; j < s.length; ++j) {
            const code = s.charCodeAt(j);
            if (code == i + 65 || code == i + 97) ++val; // count upper- and lowercase occurrences
        }
        result.push(val);
    }
    return result;
}
// Token matchers:
const sift4TokenMatcher = (t1, t2) => (1 - sift4distance(t1, t2, 5) / Math.max(t1.length, t2.length)) > 0.7;
// Matching evaluators:
const sift4MatchingEvaluator = (t1, t2) => 1 - sift4distance(t1, t2, 5) / Math.max(t1.length, t2.length);
// Local length evaluators:
const rewardLengthEvaluator = l => l < 1 ? l : l - 1 / (l + 1); // 1 -> 0.5, 2 -> 1.67, 9 -> 8.9
const rewardLengthEvaluator2 = l => Math.pow(l, 1.5); // 0 -> 0, 1 -> 1, 2 -> 2.83, 10 -> 31.62
// Transposition cost evaluators:
const longerTranspositionsAreMoreCostly = (c1, c2) => Math.abs(c2 - c1) / 9 + 1;
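// A sketch combining the helpers above into a fuzzy, word-level distance
// (fuzzyWordDistance is a hypothetical helper, not part of the library): the
// 0.7 threshold in sift4TokenMatcher lets slightly misspelled words still
// count as matches, and sift4MatchingEvaluator weights them by similarity.
const fuzzyWordDistance = (strA, strB) => sift4distance(strA, strB, 5, {
    tokenizer: wordSplitTokenizer,
    tokenMatcher: sift4TokenMatcher,
    matchingEvaluator: sift4MatchingEvaluator,
});
// fuzzyWordDistance('the quick brown fox', 'the quikc brown fox'); // 0 - 'quikc' still matches 'quick'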