JAVA文章相似度对比（hanlp）

83 阅读 0 评论 55 点赞

我是靠谱客的博主俊逸自行车，最近开发中收集的这篇文章主要介绍JAVA文章相似度对比（hanlp），觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

采用hanlp进行分词

需要下载：HanLP 的 .jar 包、data 数据文件夹，以及 hanlp.properties 配置文件

pom依赖引入

<!-- HanLP: used by the Java class in this article for keyword and summary extraction -->
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>1.7.6</version>
</dependency>
<!-- jsoup: declared here, but not referenced by the Java code shown in this article -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

修改 hanlp.properties：将其中的 root 配置项改为 data 文件夹所在的父目录路径

package com.thinkgem.jeesite.modules.lgslt.utils.check;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.compress.utils.Lists;

import com.hankcs.hanlp.HanLP;
import com.thinkgem.jeesite.common.utils.StringUtils;
import com.thinkgem.jeesite.modules.flfg.entity.FlfgContent;
import com.thinkgem.jeesite.modules.flfg.entity.FlfgSentence;

import lombok.extern.log4j.Log4j;

/**
 * Text-comparison utilities built on HanLP keyword extraction.
 *
 * <p>Provides keyword and summary extraction, regex-based substring extraction, and
 * cosine-similarity ranking of {@link FlfgContent} and {@link FlfgSentence} records
 * based on their comma-separated keyword strings.</p>
 *
 * @author chenlf
 * @date 2020-07-21
 */
@Log4j
public class ContSimilarUtil {

	/** Number of keywords extracted for a whole document. */
	public static final int CONTENT_KEYWORD_SIZE = 100;
	/** Number of keywords extracted for a single clause/sentence. */
	public static final int SENTENCE_KEYWORD_SIZE = 20;
	/** Number of summary sentences. */
	public static final int SUMMARY_SIZE = 20;

	/**
	 * Extracts keywords from a text and joins them with commas.
	 *
	 * @param text the text to analyse
	 * @param size the number of keywords requested from HanLP
	 * @return the extracted keywords joined with {@code ","}
	 */
	public static String getKeyWords(String text, int size) {
		List<String> keywords = HanLP.extractKeyword(text, size);
		return StringUtils.join(keywords, ",");
	}

	/**
	 * Extracts summary sentences from a text and joins them with line breaks.
	 *
	 * @param text the text to summarise
	 * @param size the number of summary sentences requested from HanLP
	 * @return the summary sentences, one per line
	 */
	public static String getSummarys(String text, int size) {
		List<String> summarys = HanLP.extractSummary(text, size);
		// The separator is a newline; the previous literal "n" glued sentences together with the letter n.
		return StringUtils.join(summarys, "\n");
	}

	/**
	 * Returns every substring of {@code text} that matches {@code regex}.
	 *
	 * @param regex the regular expression to search for
	 * @param text  the text to search; {@code null} yields an empty list
	 * @return the full text of each match, in order of occurrence
	 */
	public static List<String> getByRegex(String regex, String text) {
		List<String> matches = new ArrayList<>();
		if (text == null) {
			return matches;
		}
		Matcher matcher = Pattern.compile(regex).matcher(text);
		while (matcher.find()) {
			// group() is the whole match; group(1) would return only the first capture group.
			matches.add(matcher.group());
		}
		return matches;
	}

	/**
	 * Extracts the text found between a start marker and an end marker.
	 *
	 * <p>Both markers are used as regular-expression fragments: the search pattern is
	 * {@code start + "(.*?)" + end}. The markers are then removed from each match with a
	 * regex replace, so any further occurrence of either marker inside the matched text
	 * is removed as well.</p>
	 *
	 * @param start the start marker (regex fragment)
	 * @param end   the end marker (regex fragment)
	 * @param text  the text to search; {@code null} yields an empty list
	 * @return the matched segments with both markers removed, in order of occurrence
	 */
	public static List<String> getByRegexSymbol(String start, String end, String text) {
		List<String> titles = new ArrayList<>();
		if (text == null) {
			return titles;
		}
		// Compile the three patterns once instead of recompiling the markers for every match.
		Pattern startPattern = Pattern.compile(start);
		Pattern endPattern = Pattern.compile(end);
		Matcher matcher = Pattern.compile(start + "(.*?)" + end).matcher(text);
		while (matcher.find()) {
			String withoutStart = startPattern.matcher(matcher.group()).replaceAll("");
			titles.add(endPattern.matcher(withoutStart).replaceAll(""));
		}
		return titles;
	}

	/**
	 * Ranks whole documents by keyword similarity to the given keyword string.
	 *
	 * <p>Each candidate whose cosine similarity is at least {@code likeValueMin} gets its
	 * {@code likeValue} set to the similarity as a percentage rounded to two decimals
	 * (the candidate objects themselves are modified). The retained candidates are
	 * returned in descending order of that value.</p>
	 *
	 * @param keywords        comma-separated keywords of the reference document
	 * @param flfgContentList candidates to compare; their comma-separated {@code keyword}
	 *                        values are used
	 * @param n               maximum number of results; a value of 0 or less returns all
	 * @param likeValueMin    minimum similarity, between 0 and 1
	 * @return the matching candidates, most similar first
	 */
	public static List<FlfgContent> getFlfgContentSimilarList(String keywords,
			List<FlfgContent> flfgContentList, int n, double likeValueMin) {

		log.debug("---- 文章相似度比较开始 ----");

		List<FlfgContent> results = new ArrayList<>();
		List<String> sent1Words = splitKeywords(keywords);
		if (flfgContentList != null) {
			for (FlfgContent beCompare : flfgContentList) {
				List<String> sent2Words = splitKeywords(beCompare.getKeyword());
				double likeValue = getSimilarity(sent1Words, sent2Words);
				if (likeValue >= likeValueMin) {
					// Round to a two-decimal percentage arithmetically; formatting to a string and
					// parsing it back fails in locales whose decimal separator is not '.'.
					beCompare.setLikeValue(Math.round(likeValue * 10000.0) / 100.0);
					results.add(beCompare);
				}
			}
		}
		log.debug("---- 文章相似度比较结束 ----");

		log.debug("---- 文章相似度比较 排序 开始 ----");
		results.sort((o1, o2) -> o2.getLikeValue().compareTo(o1.getLikeValue()));
		log.debug("---- 文章相似度比较 排序 结束 ----");

		return firstN(results, n);
	}

	/**
	 * Ranks clauses by keyword similarity to the given text.
	 *
	 * <p>Keywords are extracted from {@code compare} with HanLP. Every candidate gets its
	 * {@code likeValue} set to the raw cosine similarity (0 to 1, not a percentage; the
	 * candidate objects themselves are modified). All candidates are returned in
	 * descending order of that value; none are filtered out.</p>
	 *
	 * @param compare the reference text
	 * @param list    candidates to compare; their comma-separated {@code keyword} values are used
	 * @param n       maximum number of results; a value of 0 or less returns all
	 * @return the candidates, most similar first
	 */
	public static List<FlfgSentence> getFlfgSentenceSimilarList(String compare,
			List<FlfgSentence> list, int n) {
		List<String> sent1Words = HanLP.extractKeyword(compare, SENTENCE_KEYWORD_SIZE);

		List<FlfgSentence> results = new ArrayList<>();
		for (FlfgSentence beCompare : list) {
			List<String> sent2Words = splitKeywords(beCompare.getKeyword());
			beCompare.setLikeValue(getSimilarity(sent1Words, sent2Words));
			results.add(beCompare);
		}

		results.sort((o1, o2) -> o2.getLikeValue().compareTo(o1.getLikeValue()));

		return firstN(results, n);
	}

	/**
	 * Computes the cosine similarity of two word lists using word-count vectors.
	 *
	 * @param sent1Words words of the first text
	 * @param sent2Words words of the second text
	 * @return the cosine similarity; 0 when either list is {@code null} or contains no words
	 */
	public static double getSimilarity(List<String> sent1Words, List<String> sent2Words) {
		if (sent1Words == null || sent2Words == null) {
			return 0;
		}
		// Vocabulary: every distinct word that occurs in either list.
		List<String> allWords = mergeList(sent1Words, sent2Words);

		int[] statistic1 = statistic(allWords, sent1Words);
		int[] statistic2 = statistic(allWords, sent2Words);

		double dotProduct = 0;
		double squaredNorm1 = 0;
		double squaredNorm2 = 0;
		for (int i = 0; i < statistic1.length; i++) {
			dotProduct += (double) statistic1[i] * statistic2[i];
			squaredNorm1 += (double) statistic1[i] * statistic1[i];
			squaredNorm2 += (double) statistic2[i] * statistic2[i];
		}

		double denominator = Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2);
		if (denominator == 0) {
			// A zero vector would give 0/0 = NaN, and NaN sorts above every real score.
			return 0;
		}
		return dotProduct / denominator;
	}

	/**
	 * Counts, for each vocabulary word, how often it occurs in {@code sentWords}.
	 *
	 * @param allWords  the vocabulary
	 * @param sentWords the words to count
	 * @return occurrence counts, index-aligned with {@code allWords}
	 */
	private static int[] statistic(List<String> allWords, List<String> sentWords) {
		int[] result = new int[allWords.size()];
		for (int i = 0; i < allWords.size(); i++) {
			result[i] = Collections.frequency(sentWords, allWords.get(i));
		}
		return result;
	}

	/**
	 * Merges two word lists into the list of their distinct words.
	 *
	 * @param list1 first word list
	 * @param list2 second word list
	 * @return the distinct words of both lists, in order of first occurrence
	 */
	private static List<String> mergeList(List<String> list1, List<String> list2) {
		List<String> result = new ArrayList<>();
		result.addAll(list1);
		result.addAll(list2);
		return result.stream().distinct().collect(Collectors.toList());
	}

	/**
	 * Splits a comma-separated keyword string into a word list.
	 *
	 * @param keywords comma-separated keywords; may be {@code null} or empty
	 * @return the keywords; an empty list for {@code null} or empty input, so that a missing
	 *         keyword string is not treated as one empty word
	 */
	private static List<String> splitKeywords(String keywords) {
		if (keywords == null || keywords.isEmpty()) {
			return Collections.emptyList();
		}
		return Arrays.asList(keywords.split(","));
	}

	/**
	 * Limits an already sorted list to its first {@code n} elements.
	 *
	 * @param sorted the sorted results
	 * @param n      maximum number of elements; a value of 0 or less keeps all
	 * @return {@code sorted} itself when no truncation is needed, otherwise a new list
	 *         holding the first {@code n} elements
	 */
	private static <T> List<T> firstN(List<T> sorted, int n) {
		if (n > 0 && sorted.size() > n) {
			// Copy instead of returning the subList view, which stays backed by the full list.
			return new ArrayList<>(sorted.subList(0, n));
		}
		return sorted;
	}
}