一、在日常工作中,有时需要查找形似的汉字。网上已经有基于四角码和字形结构查找形似字的方案,但我的需求相对简单,于是用 Java 基于余弦相似度做了一个小测试,发现效果还可以。
二、具体代码
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.builder.CompareToBuilder;
import org.junit.Test;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.List;

public class HanziSimilarTest {
// Directory that the per-character bitmaps are written to and read from.
private String outPath = "e:/tmp/heiti-bmp/";
// Candidate text; the tests iterate over it one char at a time.
private static final String words="涅槃乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园,浣熊乐园,浣熊乐园,浣熊乐园,浣熊乐园,浣熊乐园,浣熊乐园,鸵鸟乐园,鸵鸟乐园,鸵鸟乐园";

/**
 * Returns a copy of {@code map} whose iteration order is sorted by value.
 * Entries with equal values are ordered by the string form of their keys
 * (always ascending, regardless of {@code isDesc}).
 *
 * @param map    the map to sort; it is not modified
 * @param isDesc {@code true} to order values from largest to smallest,
 *               {@code false} for smallest to largest
 * @param <K>    key type
 * @param <V>    value type, compared through its natural ordering
 * @return a new {@link LinkedHashMap} holding the entries in sorted order
 */
public static <K, V extends Comparable<? super V>> Map<K, V> mapSortByValue(Map<K, V> map, final boolean isDesc) {
    List<Map.Entry<K, V>> entries = new ArrayList<Map.Entry<K, V>>(map.entrySet());
    Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
        @Override
        public int compare(Map.Entry<K, V> left, Map.Entry<K, V> right) {
            // Swapping the operands of the value comparison is what reverses the order.
            V valueLhs = isDesc ? right.getValue() : left.getValue();
            V valueRhs = isDesc ? left.getValue() : right.getValue();
            return new CompareToBuilder()
                    .append(valueLhs, valueRhs)
                    .append(left.getKey().toString(), right.getKey().toString())
                    .toComparison();
        }
    });
    // LinkedHashMap keeps insertion order, so the sorted order survives in the result.
    Map<K, V> sorted = new LinkedHashMap<K, V>();
    for (Map.Entry<K, V> entry : entries) {
        sorted.put(entry.getKey(), entry.getValue());
    }
    return sorted;
}
/**
 * Renders one bitmap into {@code outPath} for every distinct character of
 * {@code words}, creating the output directory first if necessary.
 *
 * @throws IllegalStateException if the output directory does not exist and
 *                               cannot be created
 */
@Test
public void genBmp() {
    File outDir = new File(outPath);
    // mkdirs() reports failure through its return value; fail fast instead of
    // letting every later write fail.
    if (!outDir.isDirectory() && !outDir.mkdirs()) {
        throw new IllegalStateException("Cannot create output directory: " + outDir);
    }
    // words contains many repeated characters; each one maps to the same file,
    // so rendering it once is enough.
    Set<Character> rendered = new HashSet<Character>();
    for (char key : words.toCharArray()) {
        if (rendered.add(key)) {
            genWordBmp(key);
        }
    }
}
/**
 * Computes the cosine similarity of two equally long byte vectors.
 *
 * <p>Each element is used as a signed Java {@code byte} (-128..127), exactly
 * as stored in the arrays.
 *
 * @param vectorA first vector
 * @param vectorB second vector; must have the same length as {@code vectorA}
 * @return the cosine of the angle between the vectors, in [-1, 1]; {@code 0.0}
 *         when either vector has zero magnitude (including empty input), since
 *         the angle is undefined there and NaN would break later sorting
 * @throws IllegalArgumentException if the two vectors differ in length
 */
public static double cosineSimilarity(byte[] vectorA, byte[] vectorB) {
    if (vectorA.length != vectorB.length) {
        throw new IllegalArgumentException(
                "Vector lengths differ: " + vectorA.length + " vs " + vectorB.length);
    }
    double dotProduct = 0.0;
    double normA = 0.0;
    double normB = 0.0;
    for (int i = 0; i < vectorA.length; i++) {
        int a = vectorA[i];
        int b = vectorB[i];
        dotProduct += a * b;
        normA += a * a;
        normB += b * b;
    }
    if (normA == 0.0 || normB == 0.0) {
        return 0.0;
    }
    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
/**
 * Draws a single character in black on a white 50x50 grayscale image and
 * writes it as {@code outPath + word + ".bmp"}.
 *
 * <p>Write failures are reported on standard error and do not abort the caller.
 *
 * @param word the character to render
 */
private void genWordBmp(char word) {
    Font font = new Font("黑体", Font.PLAIN, 48);
    BufferedImage img = new BufferedImage(50, 50, BufferedImage.TYPE_BYTE_GRAY);
    Graphics2D g2d = img.createGraphics();
    try {
        g2d.setFont(font);
        FontMetrics fm = g2d.getFontMetrics();
        // fillRect paints with the current colour, not the background colour,
        // so the fill colour has to be selected explicitly.
        g2d.setColor(Color.WHITE);
        g2d.fillRect(0, 0, img.getWidth(), img.getHeight());
        g2d.setColor(Color.BLACK);
        // The y coordinate is the text baseline; the ascent keeps the glyph inside the image.
        g2d.drawString(String.valueOf(word), 0, fm.getAscent());
    } finally {
        g2d.dispose();
    }
    File target = new File(outPath + word + ".bmp");
    try {
        // ImageIO.write returns false, without throwing, when no suitable writer exists.
        if (!ImageIO.write(img, "bmp", target)) {
            System.err.println("No BMP writer available; nothing written to " + target);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
/**
 * Prints the characters of {@code words} whose bitmaps are most similar to
 * the bitmap of the reference character, best match first, as
 * {@code character,score} lines. At most {@code topn} lines are printed.
 *
 * <p>The candidate bitmaps are read from {@code outPath}; the reference
 * bitmap is rendered on demand if it is not there yet.
 *
 * @throws Exception if a bitmap file cannot be read
 */
@Test
public void findSimilarChar() throws Exception {
    String first = "国";
    File referenceFile = new File(outPath + first + ".bmp");
    // The reference character need not occur in words, so its bitmap may never
    // have been generated.
    if (!referenceFile.exists()) {
        genWordBmp(first.charAt(0));
    }
    byte[] b1 = FileUtils.readFileToByteArray(referenceFile);
    Map<String, Double> result = new HashMap<>();
    for (char key : words.toCharArray()) {
        String candidate = String.valueOf(key);
        // Repeated characters would yield the same score; read each file once.
        if (result.containsKey(candidate)) {
            continue;
        }
        byte[] b2 = FileUtils.readFileToByteArray(new File(outPath + key + ".bmp"));
        result.put(candidate, cosineSimilarity(b1, b2));
    }
    Map<String, Double> orderedMap = mapSortByValue(result, true);
    int topn = 10;
    int printed = 0;
    for (Map.Entry<String, Double> item : orderedMap.entrySet()) {
        // Stop after exactly topn rows.
        if (printed >= topn) {
            break;
        }
        System.out.println(item.getKey() + "," + item.getValue());
        printed++;
    }
}
}
三、输入国字得到的结果:
四、在生成图片时使用了位图bmp格式,主要目的是让每个图片的字节数相同,方便比较。
五、需要使用的读者最好是将常用汉字一次性生成形似字字典,之后使用时直接读取字典。