Anakunda / libStringDistance

// ==UserScript==
// ==UserLibrary==
// @name         libStringDistance
// @namespace    https://openuserjs.org/users/Anakunda
// @version      1.03
// @license      GPL-3.0-or-later
// @copyright    2021, Anakunda (https://openuserjs.org/users/Anakunda)
// @description  String Similarity Comparision
// @exclude      *
// ==/UserScript==
// ==/UserLibrary==

function hammingDistance(strA, strB) {
	if (typeof strA != 'string' || typeof strB != 'string') throw 'Invalid arguments';
	var d = 0, h = strA ^ strB;
	while (h > 0) { ++d; h &= h - 1; }
	return d;
}

function levenshteinDistance(strA, strB) {
    if (!strA || !strB) return 0;
	const matrix = [ ];
	for (let i = 0; i <= strB.length; ++i) matrix[i] = [i];
	for (let j = 0; j <= strA.length; ++j) matrix[0][j] = j;
	for (i = 1; i <= strB.length; ++i) for (j = 1; j <= strA.length; ++j)
	    matrix[i][j] = strB.charAt(i - 1) == strA.charAt(j - 1) ? matrix[i - 1][j - 1]
            : Math.min(matrix[i - 1][j - 1] + 1, Math.min(matrix[i][j - 1] + 1, matrix[i - 1][j] + 1));
	return matrix[strB.length][strA.length];
}
const levensteinDistance = levenshteinDistance;

function cosineSimilarity(strA, strB) {
	function termFreqMap(str) {
		let words = str.split(' '), termFreq = { };
		words.forEach(w => { termFreq[w] = (termFreq[w] || 0) + 1 });
		return termFreq;
	}
	function termFreqMapToVector(map, dict) {
		let termFreqVector = [ ];
		for (let term in dict) termFreqVector.push(map[term] || 0);
		return termFreqVector;
	}
	function vecDotProduct(vecA, vecB) {
		let product = 0;
		for (let i = 0; i < vecA.length; ++i) product += vecA[i] * vecB[i];
		return product;
	}
	function vecMagnitude(vec) {
		let sum = 0;
		for (let i = 0; i < vec.length; ++i) sum += vec[i] * vec[i];
		return Math.sqrt(sum);
	}

    if (!strA || !strB) return 0;
	const addKeysToDict = (map, dict) => { for (let key in map) dict[key] = true }
	const _cosineSimilarity = (vecA, vecB) => vecDotProduct(vecA, vecB) / (vecMagnitude(vecA) * vecMagnitude(vecB));
	let termFreqA = termFreqMap(strA), termFreqB = termFreqMap(strB), dict = { };
	addKeysToDict(termFreqA, dict);
	addKeysToDict(termFreqB, dict);
	let termFreqVecA = termFreqMapToVector(termFreqA, dict), termFreqVecB = termFreqMapToVector(termFreqB, dict);
	return _cosineSimilarity(termFreqVecA, termFreqVecB);
}

function jaroWinklerSimilarity(strA, strB) {
    if (!strA || !strB) return 0; else if (strA == strB) return 1;
	let m = 0, range = (Math.floor(Math.max(strA.length, strB.length) / 2)) - 1;
	let s1Matches = Array(strA.length), s2Matches = Array(strB.length);
	for (let i = 0; i < strA.length; ++i) {
		const low = (i >= range) ? i - range : 0;
		const high = (i + range <= strB.length) ? (i + range) : (strB.length - 1);
		for (let j = low; j <= high; ++j) {
			if (s1Matches[i] === true || s2Matches[j] === true || strA[i] !== strB[j]) continue;
			++m;
			s1Matches[i] = s2Matches[j] = true;
			break;
		}
	}
	if (m == 0) return 0;
	let k = 0, n_trans = 0;
	for (let i = 0, j; i < strA.length; ++i) {
		if (s1Matches[i] !== true) continue;
		for (j = k; j < strB.length; ++j) {
			if (s2Matches[j] !== true) continue;
			k = j + 1;
			break;
		}
		if (strA[i] !== strB[j]) ++n_trans;
	}
	let weight = (m / strA.length + m / strB.length + (m - (n_trans / 2)) / m) / 3, l = 0, p = 0.1;
	if (weight > 0.7) {
		while (strA[l] === strB[l] && l < 4) ++l;
		weight += l * p * (1 - weight);
	}
	return weight;
}
const jaroWrinkerSimilarity = jaroWinklerSimilarity;

function trigramIndex(inputPhrases) {
	function asTrigrams(phrase, callback) {
		var rawData = "  ".concat(phrase, "  ");
		for (var i = rawData.length - 3; i >= 0; i = i - 1) callback.call(this, rawData.slice(i, i + 3));
	};

	if (!Array.isArray(inputPhrases)) return null;
	var instance = {
		phrases: [],
		trigramIndex: [],

		index: function(phrase) {
			if (!phrase || this.phrases.includes(phrase)) return;
			var phraseIndex = this.phrases.push(phrase) - 1;
			asTrigrams.call(this, phrase, function (trigram) {
				var phrasesForTrigram = this.trigramIndex[trigram];
				if (!phrasesForTrigram) phrasesForTrigram = [];
				if (phrasesForTrigram.indexOf(phraseIndex) < 0) phrasesForTrigram.push(phraseIndex);
				this.trigramIndex[trigram] = phrasesForTrigram;
			});
		},

		find: function(phrase) {
			var phraseMatches = [], trigramsInPhrase = 0;
			asTrigrams.call(this, phrase, function(trigram) {
				var phrasesForTrigram = this.trigramIndex[trigram];
				trigramsInPhrase += 1;
				if (phrasesForTrigram) for (var j in phrasesForTrigram) {
					let phraseIndex = phrasesForTrigram[j];
					if (!phraseMatches[phraseIndex]) phraseMatches[phraseIndex] = 0;
					phraseMatches[phraseIndex] += 1;
				}
			});
			var result = [];
			for (var i in phraseMatches) result.push({ phrase: this.phrases[i], matches: phraseMatches[i] });
			result.sort((a, b) => b.matches - a.matches);
			return result;
		}
	};
	for (let i in inputPhrases) instance.index(inputPhrases[i]);
	return instance;
}
const trigramCompare = (strA, strB) => trigramIndex([strA]).find(strB);

// Sift4 - extended version
// online algorithm to compute the distance between two strings in O(n)
// maxOffset is the number of positions to search for matching tokens
// options: the options for the function, allowing for customization of the scope and algorithm:
//         maxDistance: the distance at which the algorithm should stop computing the value and just exit (the strings are too different anyway)
//         tokenizer: a function to transform strings into vectors of tokens
//        tokenMatcher: a function to determine if two tokens are matching (equal)
//        matchingEvaluator: a function to determine the way a token match should be added to the local_cs. For example a fuzzy match could be implemented.
//        localLengthEvaluator: a function to determine the way the local_cs value is added to the lcss. For example longer continuous substrings could be awarded.
//        transpositionCostEvaluator: a function to determine the value of an individual transposition. For example longer transpositions should have a higher cost.
//        transpositionsEvaluator: a function to determine the way the total cost of transpositions affects the final result
// the options can and should be implemented at a class level, but this is the demo algorithm
function sift4distance(strA, strB, maxOffset = 5, options) {
	options = (function extend(obj, def) {
		let result = { };
		for (let prop in def) result[prop] = (!obj || !obj.hasOwnProperty(prop) ? def : obj)[prop];
		return result;
	})(options, {
		maxDistance: null,
		tokenizer: s => s ? s.split('') : [ ],
		tokenMatcher: (t1, t2) => t1 == t2,
		matchingEvaluator: (t1, t2) => 1,
		localLengthEvaluator: local_cs => local_cs,
		transpositionCostEvaluator: (c1, c2) => 1,
		transpositionsEvaluator: (lcss, trans) => lcss - trans,
	});

	let t1 = options.tokenizer(strA), t2 = options.tokenizer(strB);
	let l1 = t1.length, l2 = t2.length;
	if (l1 == 0) return l2;
	if (l2 == 0) return l1;
	let c1 = 0, c2 = 0, lcss = 0, local_cs = 0, trans = 0, offset_arr = [ ];
	while ((c1 < l1) && (c2 < l2)) {
		if (options.tokenMatcher(t1[c1], t2[c2])) {
			local_cs += options.matchingEvaluator(t1[c1], t2[c2]);
			//see if current match is a transposition
			let isTrans = false, i = 0;
			while (i < offset_arr.length) {
				let ofs = offset_arr[i];
				if (c1 <= ofs.c1 || c2 <= ofs.c2) {
					// when two matches cross, the one considered a transposition is the one with the largest difference in offsets
					isTrans = Math.abs(c2 - c1) >= Math.abs(ofs.c2 - ofs.c1);
					if (isTrans) trans += options.transpositionCostEvaluator(c1, c2);
					else if (!ofs.trans) {
						ofs.trans = true;
						trans += options.transpositionCostEvaluator(ofs.c1, ofs.c2);
					}
					break;
				} else if (c1 > ofs.c2 && c2 > ofs.c1) offset_arr.splice(i, 1); else ++i;
			}
			offset_arr.push({ c1: c1, c2: c2, trans: isTrans });
		} else {
			lcss += options.localLengthEvaluator(local_cs);
			local_cs = 0;
			if (c1 != c2) c1 = c2 = Math.min(c1, c2); // using min allows the computation of transpositions
			if (options.maxDistance) {
				let temporaryDistance = options.localLengthEvaluator(Math.max(c1, c2)) -
					options.transpositionsEvaluator(lcss, trans);
				if (temporaryDistance > options.maxDistance) return Math.round(temporaryDistance);
			}
			// if matching tokens are found, remove 1 from both cursors (they get incremented at the end of the loop)
			// so that we can have only one code block handling matches
			for (let i = 0; i < maxOffset && (c1 + i < l1 || c2 + i < l2); i++) {
				if ((c1 + i < l1) && options.tokenMatcher(t1[c1 + i], t2[c2])) { c1 += i - 1; c2--; break; }
				if ((c2 + i < l2) && options.tokenMatcher(t1[c1], t2[c2 + i])) { c1--; c2 += i - 1; break; }
			}
		}
		++c1; ++c2;
		// this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly
		if ((c1 >= l1) || (c2 >= l2)) {
			lcss += options.localLengthEvaluator(local_cs);
			local_cs = 0;
			c1 = c2 = Math.min(c1, c2);
		}
	}
	lcss += options.localLengthEvaluator(local_cs);
	return Math.round(options.localLengthEvaluator(Math.max(l1, l2)) - options.transpositionsEvaluator(lcss, trans)); //add the cost of found transpositions
}

// possible values for the options

// tokenizers:
function nGramTokenizer(s, n) { //tokenizer:function(s) { return nGramTokenizer(s,2); }
	let result = [ ];
	if (s) for (let i = 0; i <= s.length - n; ++i) result.push(s.substr(i, n));
	return result;
}

const wordSplitTokenizer = s => s ? s.split(/\s+/) : [ ];

function characterFrequencyTokenizer(s) { //tokenizer:characterFrequencyTokenizer (letters only)
	let result = [ ];
	for (let i = 0; i < 26; ++i) {
		let val = 0;
		if (s) for (let j = 0; j < s.length; ++j) {
			let code = s.charCodeAt(j);
			if (code == i + 65 || code == i + 97) ++val;
		}
		result.push(val);
	}
	return result;
}

// tokenMatchers:
const sift4TokenMatcher = (t1, t2) => (1 - sift4distance(t1, t2, 5) / Math.max(t1.length, t2.length)) > 0.7;

// matchingEvaluators:
const sift4MatchingEvaluator = (t1, t2) => 1 - sift4distance(t1, t2, 5) / Math.max(t1.length, t2.length);

// localLengthEvaluators:
const rewardLengthEvaluator = l => l < 1 ? l : l - 1 / (l + 1); //1 -> 0.5, 2-> 0.66, 9 -> 0.9
const rewardLengthEvaluator2 = l => Math.pow(l, 1.5); // 0 -> 0, 1 -> 1, 2 -> 2.83, 10 -> 31.62

// transpositionCostEvaluators:
const longerTranspositionsAreMoreCostly = (c1, c2) => Math.abs(c2 - c1) / 9 + 1;

//////////////////////////////////////////////////////////////////////////
////////////////////////////// SAFE PADDING //////////////////////////////
//////////////////////////////////////////////////////////////////////////