概述
package com.miaodi.api.util;
public class SimilarityRatioUtils {
/**
* 莱茵斯坦距离
*
* @param str
* @param target
* @return
*/
public static int levenshiteinDistance(String str, String target) {
int s = str.length();
int t = target.length();
if (s == 0) {
return t;
}
if (t == 0) {
return s;
}
int d[][];
int si;
int tj;
char strChar;
char tagetChar;
int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1
d = new int[s + 1][t + 1];
for (si = 0; si <= s; si++) {// 初始化第一列
d[si][0] = si;
}
for (si = 1; si <= s; si++) {// 遍历str
strChar = str.charAt(si - 1);
for (tj = 1; tj <= t; tj++) {
tagetChar = target.charAt(tj - 1);
/*
* 忽略大小写 if (strChar == tagetChar || strChar == tagetChar + 32 || strChar + 32
* == tagetChar) {
*/
if (strChar == tagetChar) {
temp = 0;
} else {
temp = 1;
}
d[si][tj] = min(d[si - 1][tj] + 1, d[si][tj - 1] + 1, d[si - 1][tj - 1] + temp);
}
}
return d[s][t];
}
private static int min(int one, int two, int three) {
return (one = one < two ? one : two) < three ? one : three;
}
/**
* 获取两字符串的相似度
*/
public static float getSimilarityRatio(String str, String target) {
// 1-莱茵斯坦距离/最大字符串长度
return 1 - (float) levenshiteinDistance(str, target) / Math.max(str.length(), target.length());
}
}
package com.miaodi.api.handler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
/**
* 模板提取
* @author Administrator
*
*/
public class TemplateDistill implements Callable<String>{
public TemplateDistill(String source, String target, List<Future<String>> futures, ExecutorService pool, int startSourceIDX,int startTargetIDX, boolean lastIsPlaceholder,String templatePrefix,int variableNum) {
super();
this.source = source;
this.target = target;
this.futures = futures;
this.pool = pool;
this.startSourceIDX = startSourceIDX;
this.startTargetIDX = startTargetIDX;
this.lastIsPlaceholder = lastIsPlaceholder;
this.templatePrefix = templatePrefix == null ? "":templatePrefix;
this.variableNum = variableNum;
}
private String templatePrefix;
private final int maxVariableNum = 3;
private int variableNum;
private int startSourceIDX;
private int startTargetIDX;
private boolean lastIsPlaceholder;
private final String source;
private final String target;
private final List<Future<String>> futures;
private final ExecutorService pool;
@SuppressWarnings("unchecked")
public String call() throws Exception {
StringBuilder template = new StringBuilder(templatePrefix);
while (true) {
// 获取最大连击数
Map<String, Object> result = carom(source, target, startSourceIDX, startTargetIDX);
int maxCaromNum = (Integer) result.get("caromNum");
if (maxCaromNum == 0) {
if(!lastIsPlaceholder) {
variableNum += 1;
if (variableNum > maxVariableNum) {
return null;
}
template.append("|");
lastIsPlaceholder = true;
}
if (result.get("otherTargetIndexs") != null) {
// 开启多线程找模板
List<Integer> otherTargetIndexs = (List<Integer>) result.get("otherTargetIndexs");
for (Integer otherTargetIndex : otherTargetIndexs) {
futures.add(pool.submit(new TemplateDistill(source, target, futures, pool, startSourceIDX, otherTargetIndex, lastIsPlaceholder,template.toString(), variableNum)));
}
}
startSourceIDX += 1;
}else{
template.append(source.substring(startSourceIDX, startSourceIDX + maxCaromNum));
startSourceIDX += maxCaromNum;
startTargetIDX += maxCaromNum;
lastIsPlaceholder = false;;
}
if (source.length() <= startSourceIDX) {
if (!lastIsPlaceholder && target.length() > startTargetIDX) {
variableNum += 1;
if (variableNum > maxVariableNum) {
return null;
}
template.append("|");
}
break;
}
if (target.length() <= startTargetIDX) {
if(!lastIsPlaceholder) {
variableNum += 1;
if (variableNum > maxVariableNum) {
return null;
}
template.append("|");
}
break;
}
}
return template.toString();
}
/**
* 获取连击数
* @param source 字符串s
* @param target 字符串t
* @param startSourceIDX 字符串s从什么下标开始匹配
* @param startTargetIDX 字符串t从什么下标开始匹配
* @return
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
private static Map<String,Object> carom(String source,String target,Integer startSourceIDX,Integer startTargetIDX){
// 连击次数
int attemptCaromNum = 0;
while(true) {
// 每次累加1,最后减1
attemptCaromNum += 1;
// 本次匹配的子串的结束下标
int endSourceIDX = startSourceIDX + attemptCaromNum;
if (endSourceIDX > source.length()) {
break;
}
// 本次匹配的子串
String attemptStr = source.substring(startSourceIDX, endSourceIDX);
int idx = target.indexOf(attemptStr, startTargetIDX);
// 匹配失败
if (idx != startTargetIDX) {
break;
}
}
attemptCaromNum -= 1;
// 结果
Map<String, Object> result = new HashMap<String, Object>();
result.put("caromNum", attemptCaromNum);
// 匹配失败时
if (attemptCaromNum == 0) {
// 单纯的一个字符
int endStrIDX = startSourceIDX + 1;
if (endStrIDX <= source.length()) {
// 匹配他失败后的字符
List<Integer> otherTargetIndexs = new ArrayList();
char c = source.charAt(startSourceIDX);
char[] cs = target.substring(startTargetIDX+1).toCharArray();
for (int i = 0; i < cs.length; i++) {
if (c == cs[i]) {
otherTargetIndexs.add(startTargetIDX+1+i);
}
}
result.put("otherTargetIndexs", otherTargetIndexs.isEmpty() ? null : otherTargetIndexs);
}
}
return result;
}
}
package com.miaodi.api.handler;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import com.miaodi.api.util.SimilarityRatioUtils;
public class TemplateDistillUtil {
private static final ExecutorService executorPool = new ThreadPoolExecutor(10, 30, 30, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(50),new ThreadPoolExecutor.CallerRunsPolicy());
public static String distillTemplates(List<String> targets){
if (targets.isEmpty()) {
return null;
}else if(targets.size() == 1) {
return targets.get(0);
}
String source = targets.get(0);
String template = null;
for (int i = 1; i < targets.size(); i++) {
if (i==1) {
template = distillTemplates(source, targets.get(i));
}else {
if (template == null) {
return null;
}
template = distillTemplates(template, targets.get(i));
}
}
return format(template == null ? "|": template);
}
public static String distillTemplates(String source, String target) {
// 符合这两字符串的全部模板
List<Future<String>> futures = new Vector<Future<String>>();
futures.add(executorPool.submit(new TemplateDistill(source,target,futures,executorPool,0,0,false,null,0)));
Set<String> templates = new HashSet<String>();
while (true) {
int y = 0;
for (int i = 0; i < futures.size(); i++) {
try {
y += 1;
String template = futures.get(i).get();
if (template != null) {
templates.add(futures.get(i).get());
}
} catch (Exception e) {
e.printStackTrace();
}
}
if (y == futures.size()) {
break;
}
}
float maxRatio = 0;
String maxRatioTemplate = null;
for (String template : templates) {
float ratio = SimilarityRatioUtils.getSimilarityRatio(source, template);
if (ratio > maxRatio) {
maxRatio = ratio;
maxRatioTemplate = template;
}
}
return maxRatioTemplate;
}
public static String format(String template) {
if (template == null) {
return null;
}
StringBuilder sb = new StringBuilder();
String[] vs = template.split("\|");
for (int i = 1; i <= vs.length; i++) {
sb.append(vs[i-1]);
if (i != vs.length) {
sb.append("{"+i+"}");
}
}
if (template.lastIndexOf("|") == template.length()-1) {
sb.append("{"+((vs.length-1)<0?0:(vs.length-1))+"}");
}
return sb.toString();
}
// public static void main(String[] args) {
//
//
System.out.println(format("123|123|"));
//
//
long start = System.currentTimeMillis();
//
//
String[] strs = new String[] {
"廖兴攀-有线,在2017-10-01 11:53:22",
"鲍冬梅-有线,在2017-10-01 11:53:11"
"【兴业银行】尊敬的用户,于10月8日前点 https://rsp.mobi/N3eOs4 申请信用卡,可获批10万额度。退订回T",
"【兴业银行】尊敬的用户,于10月8日前点 https://rsp.mobi/A5jk6Z 申请信用卡,可获批10万额度。退订回T"
"【随你花】尊敬的王建辉,您的账单现在到期了,逾期将取消再借资格,请及时处理!需要链接找客服。回T退订",
"【随你花】尊敬的项俊,您的账单现在到期了,逾期将取消再借资格,请及时处理!需要链接找客服。回T退订",
"廖兴攀-有线,在2017-10-01 11:53:37触发了上线提醒,请留意。",
"鲍冬梅-000有线,在2017-10-01 11:53:45触发了上线提醒,请留意。",
//
"123xxx12",
//
"123xxz12",
//
"123xxz221211",
,"【银汉科技】尊贵的易美达分期客户:郭文娟,你总12期的第2期1866.67元,2017-12-31为本期还款日,到目前未还款,现已经逾期,请珍视个人信用,维护良好的账户记录和信誉。如有疑问,请致电4000-512-012,退订回T。"
//
};
System.out.println(SimilarityRatioUtils.getSimilarityRatio(strs[0], strs[1]));
//
String templates = distillTemplates(strs);
float maxRatio = 0;
String maxRatioTemplate = null;
for (String template : templates) {
float ratio = SimilarityRatioUtils.getSimilarityRatio(strs[0], template);
if (ratio > maxRatio) {
maxRatio = ratio;
maxRatioTemplate = template;
}
}
//
System.out.println("模板:"+templates);
Matcher matcher = Pattern.compile("\{\d+\}").matcher(maxRatioTemplate);
String templete = matcher.replaceAll(".*");
String regex = String.format("^%s$", templete); // 构建新的pattern字符串
for (String string : strs) {
System.out.println(Pattern.compile(regex).matcher(string).find());
}
//
System.out.println("共计耗时:"+ (System.currentTimeMillis()-start) +"毫秒");
// }
}
最后
以上就是糊涂故事为你收集整理的java 字符串相似度算法的全部内容,希望文章能够帮你解决java 字符串相似度算法所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复