package com.citydo.xclouddesk.utils;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.TokenizerEngine;
import cn.hutool.extra.tokenizer.TokenizerUtil;
import cn.hutool.extra.tokenizer.Word;
import cn.hutool.extra.tokenizer.engine.hanlp.HanLPEngine;
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import com.hankcs.hanlp.HanLP;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
public class ParticipleUtils {
public static double THRESHOLD = 0.2;
public static Vector<String> participleIk(String text) {
Vector<String> str = new Vector<>();
try {
StringReader reader = new StringReader(text);
IKSegmenter ik = new IKSegmenter(reader, true);
Lexeme lexeme = null;
while ((lexeme = ik.next()) != null) {
str.add(lexeme.getLexemeText());
}
if (str.size() == 0) {
return null;
}
log.info("str分词后:" + str);
} catch (IOException e) {
log.error(e.getMessage());
}
return str;
}
public static Vector<String> participleJieBa(String text) {
JiebaEngine engine = new JiebaEngine();
Result results = engine.parse(text);
String result = CollUtil.join((Iterator<Word>) results, ",");
return new Vector<>(Arrays.asList(result.split(",")));
}
public static Vector<String> participleChinese(String text){
TokenizerEngine engine = TokenizerUtil.createEngine();
Result results = engine.parse(text);
String result = CollUtil.join((Iterator<Word>)results, ",");
return new Vector<>(Arrays.asList(result.split(",")));
}
public static Vector<String> participleHanLP(String text){
TokenizerEngine engine = new HanLPEngine();
Result results = engine.parse(text);
String result = CollUtil.join((Iterator<Word>)results, ",");
return new Vector<>(Arrays.asList(result.split(",")));
}
public static double getSimilarity(Vector<String> TOne, Vector<String> TTwo) throws Exception {
int sizeOne = 0, sizeTwo = 0;
if (TOne != null && (sizeOne = TOne.size()) > 0 && TTwo != null && (sizeTwo = TTwo.size()) > 0) {
Map<String, double[]> T = new HashMap<>();
String index = null;
for (int i = 0; i < sizeOne; i++) {
index = TOne.get(i);
if (index != null) {
double[] c = T.get(index);
c = new double[2];
c[0] = 1;
c[1] = THRESHOLD;
T.put(index, c);
}
}
for (int i = 0; i < sizeTwo; i++) {
index = TTwo.get(i);
if (index != null) {
double[] c = T.get(index);
if (c != null && c.length == 2) {
c[1] = 1;
} else {
c = new double[2];
c[0] = THRESHOLD;
c[1] = 1;
T.put(index, c);
}
}
}
Iterator<String> it = T.keySet().iterator();
double sOne = 0, sTwo = 0, Ssum = 0;
while (it.hasNext()) {
double[] c = T.get(it.next());
Ssum += c[0] * c[1];
sOne += c[0] * c[0];
sTwo += c[1] * c[1];
}
return Ssum / Math.sqrt(sOne * sTwo);
} else {
throw new Exception("传入参数有问题!");
}
}
public static double findSimilarity(String sentenceOne, String sentenceTwo) {
List<String> sentOneWords = getSplitWords(sentenceOne);
List<String> sentTwoWords = getSplitWords(sentenceTwo);
List<String> allWords = mergeList(sentOneWords, sentTwoWords);
int[] statisticOne = statistic(allWords, sentOneWords);
int[] statisticTwo = statistic(allWords, sentTwoWords);
double dividend = 0;
double divisor1 = 0;
double divisor2 = 0;
int length = statisticOne.length;
for (int i = 0; i < length; i++) {
dividend += statisticOne[i] * statisticTwo[i];
divisor1 += Math.pow(statisticOne[i], 2);
divisor2 += Math.pow(statisticTwo[i], 2);
}
return dividend / (Math.sqrt(divisor1) * Math.sqrt(divisor2));
}
private static int[] statistic(List<String> allWords, List<String> sentWords) {
int[] result = new int[allWords.size()];
int size = allWords.size();
for (int i = 0; i < size; i++) {
result[i] = Collections.frequency(sentWords, allWords.get(i));
}
return result;
}
private static List<String> mergeList(List<String> listOne, List<String> listTwo) {
List<String> result = new ArrayList<>();
result.addAll(listOne);
result.addAll(listTwo);
return result.stream().distinct().collect(Collectors.toList());
}
private static List<String> getSplitWords(String sentence) {
sentence = Jsoup.parse(sentence.replace(" ","")).body().text();
return HanLP.segment(sentence).stream().map(a -> a.word).filter(s -> !"`~!@#$^&*()=|{}':;',\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“'。,、? ".contains(s)).collect(Collectors.toList());
}
}
/*
 * NOTE(review): the Maven dependency declarations below belong in pom.xml, not
 * in a .java source file — as raw XML they make this file uncompilable. They
 * are preserved here as a comment for reference until moved:
 *
 * <dependency>
 *     <groupId>com.janeluo</groupId>
 *     <artifactId>ikanalyzer</artifactId>
 *     <version>2012_u6</version>
 * </dependency>
 * <dependency>
 *     <groupId>cn.hutool</groupId>
 *     <artifactId>hutool-all</artifactId>
 *     <version>5.1.1</version>
 * </dependency>
 * <dependency>
 *     <groupId>com.hankcs</groupId>
 *     <artifactId>hanlp</artifactId>
 *     <version>portable-1.7.5</version>
 * </dependency>
 */