using EnglishStemmer;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// TF*IDF document vectorizer and string-similarity helper.
/// Documents are tokenized, stripped of stop words, stemmed (via
/// <see cref="EnglishWord"/>), and mapped either to TF*IDF vectors or to binary
/// term-presence vectors; similarity between two strings is derived from the
/// angle between their term vectors.
/// NOTE(review): a method named <c>StringCompare</c> inside a class named
/// <c>StringCompare</c> is compiler error CS0542 (member name equals enclosing
/// type). The class was presumably named differently in the original project —
/// confirm before building; renaming either here would break existing callers.
/// </summary>
public class StringCompare
{
    /// <summary>
    /// Document vocabulary, containing each word's IDF value.
    /// </summary>
    private Dictionary<string, double> _vocabularyIDF = new Dictionary<string, double>();

    /// <summary>
    /// English stop words excluded from the vocabulary during tokenization.
    /// </summary>
    public string[] stopWordsList = new string[]
    {
        "a", "about", "above", "across", "afore", "aforesaid", "after", "again", "against", "agin",
        "ago", "aint", "albeit", "all", "almost", "alone", "along", "alongside", "already", "also",
        "although", "always", "am", "american", "amid", "amidst", "among", "amongst", "an", "and",
        "anent", "another", "any", "anybody", "anyone", "anything", "are", "aren't", "around", "as",
        "aslant", "astride", "at", "athwart", "away", "b", "back", "bar", "barring", "be",
        "because", "been", "before", "behind", "being", "below", "beneath", "beside", "besides", "best",
        "better", "between", "betwixt", "beyond", "both", "but", "by", "c", "can", "cannot",
        "can't", "certain", "circa", "close", "concerning", "considering", "cos", "could", "couldn't", "couldst",
        "d", "dare", "dared", "daren't", "dares", "daring", "despite", "did", "didn't", "different",
        "directly", "do", "does", "doesn't", "doing", "done", "don't", "dost", "doth", "down",
        "during", "durst", "e", "each", "early", "either", "em", "english", "enough", "ere",
        "even", "ever", "every", "everybody", "everyone", "everything", "except", "excepting", "f", "failing",
        "far", "few", "first", "five", "following", "for", "four", "from", "g", "gonna",
        "gotta", "h", "had", "hadn't", "hard", "has", "hasn't", "hast", "hath", "have",
        "haven't", "having", "he", "he'd", "he'll", "her", "here", "here's", "hers", "herself",
        "he's", "high", "him", "himself", "his", "home", "how", "howbeit", "however", "how's",
        "i", "id", "if", "ill", "i'm", "immediately", "important", "in", "inside", "instantly",
        "into", "is", "isn't", "it", "it'll", "it's", "its", "itself", "i've", "j",
        "just", "k", "l", "large", "last", "later", "least", "left", "less", "lest",
        "let's", "like", "likewise", "little", "living", "long", "m", "many", "may", "mayn't",
        "me", "mid", "midst", "might", "mightn't", "mine", "minus", "more", "most", "much",
        "must", "mustn't", "my", "myself", "n", "near", "'neath", "need", "needed", "needing",
        "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "nigh", "nigher", "nighest",
        "nisi", "no", "no-one", "nobody", "none", "nor", "not", "nothing", "notwithstanding", "now",
        "o", "o'er", "of", "off", "often", "on", "once", "one", "oneself", "only",
        "onto", "open", "or", "other", "otherwise", "ought", "oughtn't", "our", "ours", "ourselves",
        "out", "outside", "over", "own", "p", "past", "pending", "per", "perhaps", "plus",
        "possible", "present", "probably", "provided", "providing", "public", "q", "qua", "quite", "r",
        "rather", "re", "real", "really", "respecting", "right", "round", "s", "same", "sans",
        "save", "saving", "second", "several", "shall", "shalt", "shan't", "she", "shed", "shell",
        "she's", "short", "should", "shouldn't", "since", "six", "small", "so", "some", "somebody",
        "someone", "something", "sometimes", "soon", "special", "still", "such", "summat", "supposing", "sure",
        "t", "than", "that", "that'd", "that'll", "that's", "the", "thee", "their", "theirs",
        "their's", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
        "they're", "they've", "thine", "this", "tho", "those", "thou", "though", "three", "thro'",
        "through", "throughout", "thru", "thyself", "till", "to", "today", "together", "too", "touching",
        "toward", "towards", "true", "'twas", "'tween", "'twere", "'twill", "'twixt", "two", "'twould",
        "u", "under", "underneath", "unless", "unlike", "until", "unto", "up", "upon", "us",
        "used", "usually", "v", "versus", "very", "via", "vice", "vis-a-vis", "w", "wanna",
        "wanting", "was", "wasn't", "way", "we", "we'd", "well", "were", "weren't", "wert",
        "we've", "what", "whatever", "what'll", "what's", "when", "whencesoever", "whenever", "when's", "whereas",
        "where's", "whether", "which", "whichever", "whichsoever", "while", "whilst", "who", "who'd", "whoever",
        "whole", "who'll", "whom", "whore", "who's", "whose", "whoso", "whosoever", "will", "with",
        "within", "without", "wont", "would", "wouldn't", "wouldst", "x", "y", "ye", "yet",
        "you", "you'd", "you'll", "your", "you're", "yours", "yourself", "yourselves", "you've", "z",
    };

    /// <summary>
    /// Transforms a list of documents into their associated TF*IDF values.
    /// If a vocabulary does not yet exist, one will be created, based upon the documents' words.
    /// </summary>
    /// <param name="documents">string[]</param>
    /// <param name="vocabularyThreshold">Minimum number of occurences of the term within all documents</param>
    /// <returns>double[][]</returns>
    public double[][] Transform(string[] documents, int vocabularyThreshold = 3)
    {
        List<List<string>> stemmedDocs;

        // Get the vocabulary and stem the documents at the same time.
        List<string> vocabulary = GetVocabulary(documents, out stemmedDocs, vocabularyThreshold);

        if (_vocabularyIDF.Count == 0)
        {
            // IDF(term) = ln(N / (1 + number of documents containing the term)).
            // Only computed once; subsequent calls reuse the cached IDF table.
            foreach (var term in vocabulary)
            {
                double numberOfDocsContainingTerm = stemmedDocs.Count(d => d.Contains(term));
                _vocabularyIDF[term] = Math.Log((double)stemmedDocs.Count / (1.0 + numberOfDocsContainingTerm));
            }
        }

        // Transform each document into a vector of tfidf values.
        return TransformToTFIDFVectors(stemmedDocs, _vocabularyIDF);
    }

    /// <summary>
    /// Preprocessing: converts each document into a binary term-presence vector
    /// (1 if the stemmed vocabulary term occurs in the document, else 0).
    /// </summary>
    /// <param name="documents">Documents to vectorize.</param>
    /// <returns>One 0/1 vector per document, over the shared vocabulary.</returns>
    public List<List<double>> Preprocessing(string[] documents)
    {
        List<List<string>> stemmedDocs;
        List<List<double>> r = new List<List<double>>();

        // Threshold 0: every stemmed, non-stop-word term enters the vocabulary.
        List<string> vocabulary = GetVocabulary(documents, out stemmedDocs, 0);

        foreach (List<string> lt in stemmedDocs)
        {
            List<double> temp = new List<double>();
            foreach (var item in vocabulary)
            {
                temp.Add(lt.Contains(item) ? 1 : 0);
            }
            r.Add(temp);
        }
        return r;
    }

    /// <summary>
    /// Compares the similarity of two strings.
    /// Returns a score in roughly [0, 1]: 1 when the term vectors point in the
    /// same direction, 0 when they are orthogonal (no shared terms).
    /// </summary>
    /// <param name="str1">First string.</param>
    /// <param name="str2">Second string.</param>
    /// <returns>Similarity score derived from the angle between term vectors.</returns>
    public double StringCompare(string str1, string str2)
    {
        // Extract binary term-presence vectors for both strings.
        List<List<double>> inputs = Preprocessing(new string[] { str1.ToLower(), str2.ToLower() });

        // Angle between the two vectors, in radians, then degrees.
        double RadianVal = Radian(inputs[0].ToArray(), inputs[1].ToArray());
        double Degree = (180 / Math.PI) * RadianVal;

        // Map [0 deg .. 90 deg] onto [1 .. 0].
        return (90 - Degree) / 90.0;
    }

    /// <summary>
    /// Vector normalization (L2): scales the vector to unit length.
    /// A zero-length vector maps to the zero vector (avoids division by zero).
    /// </summary>
    /// <param name="d">Input vector.</param>
    /// <returns>Unit-length copy of <paramref name="d"/>, or all zeros.</returns>
    private double[] guiyi(double[] d)
    {
        double sum = 0;
        foreach (double item in d)
        {
            sum += item * item;
        }
        double length = Math.Sqrt(sum);

        List<double> nd = new List<double>();
        foreach (double item in d)
        {
            nd.Add(length != 0.0 ? item / length : 0);
        }
        return nd.ToArray();
    }

    /// <summary>
    /// Computes the angle (in radians) between two vectors of equal length.
    /// </summary>
    /// <param name="d1">First vector.</param>
    /// <param name="d2">Second vector (must be at least as long as d1).</param>
    /// <returns>Angle in radians, via acos of the unit-vector dot product.</returns>
    private double Radian(double[] d1, double[] d2)
    {
        // Normalize both vectors so the dot product equals cos(angle).
        double[] nd1 = guiyi(d1);
        double[] nd2 = guiyi(d2);

        double sum = 0;
        for (int i = 0; i < d1.Length; i++)
        {
            sum += nd1[i] * nd2[i];
        }

        // Rounding guards against cos values marginally outside [-1, 1] from
        // floating-point error, which would make Acos return NaN.
        sum = Math.Round(sum, 4);
        return Math.Acos(sum);
    }

    /// <summary>
    /// Converts a list of stemmed documents (lists of stemmed words) and their associated vocabulary + idf values, into an array of TF*IDF values.
    /// </summary>
    /// <param name="stemmedDocs">List of List of string</param>
    /// <param name="vocabularyIDF">Dictionary of string, double (term, IDF)</param>
    /// <returns>double[][]</returns>
    private double[][] TransformToTFIDFVectors(List<List<string>> stemmedDocs, Dictionary<string, double> vocabularyIDF)
    {
        // Transform each document into a vector of tfidf values.
        List<List<double>> vectors = new List<List<double>>();
        foreach (var doc in stemmedDocs)
        {
            List<double> vector = new List<double>();
            foreach (var vocab in vocabularyIDF)
            {
                // Term frequency = count how many times the term appears in this document.
                double tf = doc.Count(d => d == vocab.Key);
                vector.Add(tf * vocab.Value);
            }
            vectors.Add(vector);
        }
        return vectors.Select(v => v.ToArray()).ToArray();
    }

    /// <summary>
    /// Normalizes a TF*IDF array of vectors using L2-Norm.
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// </summary>
    /// <param name="vectors">double[][]</param>
    /// <returns>double[][]</returns>
    public double[][] Normalize(double[][] vectors)
    {
        List<double[]> normalizedVectors = new List<double[]>();
        foreach (var vector in vectors)
        {
            normalizedVectors.Add(Normalize(vector));
        }
        return normalizedVectors.ToArray();
    }

    /// <summary>
    /// Normalizes a TF*IDF vector using L2-Norm.
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// </summary>
    /// <param name="vector">double[]</param>
    /// <returns>double[]</returns>
    public double[] Normalize(double[] vector)
    {
        double sumSquared = 0;
        foreach (var value in vector)
        {
            sumSquared += value * value;
        }
        double SqrtSumSquared = Math.Sqrt(sumSquared);

        List<double> result = new List<double>();
        foreach (var value in vector)
        {
            // L2-norm: Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2).
            // BUGFIX: a zero vector now maps to the zero vector instead of NaN
            // (consistent with guiyi's handling of zero-length vectors).
            result.Add(SqrtSumSquared != 0.0 ? value / SqrtSumSquared : 0);
        }
        return result.ToArray();
    }

    /// <summary>
    /// Saves the TFIDF vocabulary to disk.
    /// NOTE(review): BinaryFormatter is insecure and deprecated (removed in
    /// .NET 9); it is kept here only for on-disk format compatibility —
    /// consider migrating to System.Text.Json.
    /// </summary>
    /// <param name="filePath">File path</param>
    public void Save(string filePath = "vocabulary.dat")
    {
        using (FileStream fs = new FileStream(filePath, FileMode.Create))
        {
            BinaryFormatter formatter = new BinaryFormatter();
            formatter.Serialize(fs, _vocabularyIDF);
        }
    }

    /// <summary>
    /// Loads the TFIDF vocabulary from disk.
    /// NOTE(review): BinaryFormatter deserialization of untrusted files is a
    /// known remote-code-execution vector — only load trusted files.
    /// </summary>
    /// <param name="filePath">File path</param>
    public void Load(string filePath = "vocabulary.dat")
    {
        using (FileStream fs = new FileStream(filePath, FileMode.Open))
        {
            BinaryFormatter formatter = new BinaryFormatter();
            _vocabularyIDF = (Dictionary<string, double>)formatter.Deserialize(fs);
        }
    }

    #region Private Helpers

    /// <summary>
    /// Parses and tokenizes a list of documents, returning a vocabulary of words.
    /// </summary>
    /// <param name="docs">string[]</param>
    /// <param name="stemmedDocs">List of List of string</param>
    /// <param name="vocabularyThreshold">Minimum total occurrences for a term to enter the vocabulary.</param>
    /// <returns>Vocabulary (list of strings)</returns>
    private List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
    {
        Dictionary<string, int> wordCountList = new Dictionary<string, int>();
        stemmedDocs = new List<List<string>>();

        foreach (var doc in docs)
        {
            List<string> stemmedDoc = new List<string>();

            foreach (string part in Tokenize(doc))
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                if (stopWordsList.Contains(stripped.ToLower()))
                {
                    continue;
                }

                try
                {
                    var english = new EnglishWord(stripped);
                    string stem = english.Stem;

                    if (stem.Length > 0)
                    {
                        // Build the word count list.
                        // BUGFIX: first occurrence now counts as 1 (was 0), so
                        // vocabularyThreshold means "at least N occurrences"
                        // rather than "at least N+1". Threshold-0 callers
                        // (Preprocessing/StringCompare) are unaffected.
                        if (wordCountList.ContainsKey(stem))
                        {
                            wordCountList[stem]++;
                        }
                        else
                        {
                            wordCountList.Add(stem, 1);
                        }
                        stemmedDoc.Add(stem);
                    }
                }
                catch
                {
                    // Best effort: skip any token the stemmer cannot handle.
                }
            }

            stemmedDocs.Add(stemmedDoc);
        }

        // Keep only the terms that occur often enough across all documents.
        List<string> vocabulary = new List<string>();
        foreach (var item in wordCountList.Where(w => w.Value >= vocabularyThreshold))
        {
            vocabulary.Add(item.Key);
        }
        return vocabulary;
    }

    /// <summary>
    /// Tokenizes a string, returning its list of words.
    /// </summary>
    /// <param name="text">string</param>
    /// <returns>string[]</returns>
    private string[] Tokenize(string text)
    {
        // Strip all HTML.
        text = Regex.Replace(text, "<[^<>]+>", "");
        // Strip numbers.
        text = Regex.Replace(text, "[0-9]+", "number");
        // Strip urls.
        text = Regex.Replace(text, @"(http|https)://[^\s]*", "httpaddr");
        // Strip email addresses.
        text = Regex.Replace(text, @"[^\s]+@[^\s]+", "emailaddr");
        // Strip dollar sign.
        text = Regex.Replace(text, "[$]+", "dollar");
        // Strip usernames.
        text = Regex.Replace(text, @"@[^\s]+", "username");

        // Tokenize and also get rid of any punctuation
        return text.Split(" @$/#.-:&*+=[]?!(){},''\">_<;%\\".ToCharArray());
    }

    #endregion
}
// Fragment of a SQL CLR scalar UDF body whose signature is outside this chunk:
// wraps the StringCompare similarity score in a SqlDouble for SQL Server.
// NOTE(review): assumes str1/str2 are parameters of the enclosing method and
// that System.Data.SqlTypes is imported there — confirm against the full file.
double aa = new StringCompare().StringCompare(str1,str2); return new SqlDouble (aa);