作者:8prye孙瑞D | 来源:互联网 | 2023-05-19 08:27
抓取的网页内容中,有大部分会是相似的,抓取时就要过滤掉,开始考虑用VSM算法,后来发现不对,要比较太多东西了,然后就发现了simHash算法,这个算法的解释我就懒得copy了,simhash算法对
抓取的网页内容中,有大部分会是相似的,抓取时就要过滤掉,开始考虑用VSM算法,后来发现不对,要比较太多东西了,然后就发现了simHash算法,这个算法的解释我就懒得copy了,simhash算法对于短数据的支持不好,但是,我本来就是很长的数据,用上!
源码实现网上也有不少,但是貌似都是同样的,里面写得不清不楚的,虽然效果基本能达到,但是不清楚的东西,我用来做啥?
仔细研究simhash算法的说明后,把里面字符串的hash算法换成的fvn-1算法,这个在http://www.isthe.com/chongo/tech/comp/fnv/里面有说明了,具体的那些固定数值,网站上都写了。原先代码里面有些处理,和算法不符的,也换掉了。
首先搞起IKAnalyzer,切词并计算每个词的频率:
package com.cnblogs.zxub.lucene.similarity;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class WordsSpliter {
public static Map getSplitedWords(String str)
throws IOException {
// str = str.replaceAll("[0-9a-zA-Z]", "");
Analyzer analyzer = new IKAnalyzer();
Reader r = new StringReader(str);
TokenStream ts = analyzer.tokenStream("searchValue", r);
ts.addAttribute(CharTermAttribute.class);
Map result = new HashMap();
while (ts.incrementToken()) {
CharTermAttribute ta = ts.getAttribute(CharTermAttribute.class);
String word = ta.toString();
if (!result.containsKey(word)) {
result.put(word, 0);
}
result.put(word, result.get(word) + 1);
}
return result;
}
}
然后把SimHash的算法搞上:
package com.cnblogs.zxub.lucene.similarity;
import java.io.IOException;
import java.math.BigInteger;
import java.util.Map;
import java.util.Set;
public class SimHash {
private static final int HASH_BITS = 64;
private static final BigInteger FNV_64_INIT = new BigInteger(
"14695981039346656037");
private static final BigInteger FNV_64_PRIME = new BigInteger(
"1099511628211");
private static final BigInteger MASK_64 = BigInteger.ONE.shiftLeft(
HASH_BITS).subtract(BigInteger.ONE);
private String hash;
private BigInteger signature;
public SimHash(String content) throws IOException {
super();
this.analysis(content);
}
public String getHash() {
return this.hash;
}
public BigInteger getSignature() {
return this.signature;
}
private void analysis(String content) throws IOException {
Map wordInfos = WordsSpliter.getSplitedWords(content);
int[] featureVector = new int[SimHash.HASH_BITS];
Set words = wordInfos.keySet();
for (String word : words) {
BigInteger wordhash = this.fnv1_64_hash(word);
for (int i = 0; i ) {
BigInteger bitmask
=
BigInteger.ONE.shiftLeft(SimHash.HASH_BITS
- i - 1
);
if (wordhash.and(bitmask).signum() != 0
) {
featureVector[i] +=
wordInfos.get(word);
} else {
featureVector[i] -=
wordInfos.get(word);
}
}
}
BigInteger signature =
BigInteger.ZERO;
StringBuffer hashBuffer =
new StringBuffer();
for (
int i = 0; i
) {
if (featureVector[i] >= 0) {
signature = signature.add(BigInteger.ONE
.shiftLeft(SimHash.HASH_BITS - i - 1));
hashBuffer.append("1");
} else {
hashBuffer.append("0");
}
}
this.hash = hashBuffer.toString();
this.signature = signature;
}
// fnv-1 hash算法,将字符串转换为64位hash值
private BigInteger fnv1_64_hash(String str) {
BigInteger hash = FNV_64_INIT;
int len = str.length();
for (int i = 0; i ) {
hash = hash.multiply(FNV_64_PRIME);
hash = hash.xor(BigInteger.valueOf(str.charAt(i)));
}
hash = hash.and(MASK_64);
return hash;
}
public int getHammingDistance(BigInteger targetSignature) {
BigInteger x = this.getSignature().xor(targetSignature);
String s = x.toString(2);
return s.replaceAll("0", "").length();
}
public int getHashDistance(String targetHash) {
int distance;
if (this.getHash().length() != targetHash.length()) {
distance = -1;
} else {
distance = 0;
for (int i = 0; i <this.getHash().length(); i++) {
if (this.getHash().charAt(i) != targetHash.charAt(i)) {
distance++;
}
}
}
return distance;
}
}
数据库里面存个签名就好了,至于距离运算,本打算全部拉出来计算,后来发现oracle的bitand函数,就用它了!异或之后,转二进制字符串,把0去掉,取长度,再count一下长度小于4的,得到的结果就是很相似的内容数目了。以后再把计算改成用缓存的去,先偷个懒。
oracle函数部分贴上(注意Oracle的length函数永远不会返回0,最后要用个nvl函数,还有就是bitand在数值太大的时候,会溢出导致结果失误,所以要用utl_raw.bit_and,后面两个函数中字符串还不能用64位,改成128位搞定,估计还能小点,不弄了):
create or replace function bitxor(a in number,b in number) return number
is
begin
return return a+b-2*to_number(utl_raw.bit_and(to_char(a),to_char(b)));
end;
create or replace function dec2bit(v_num number) return varchar
is
v_rtn varchar(128);
v_n1 number;
v_n2 number;
begin
v_n1 := v_num;
loop
v_n2 := mod(v_n1, 2);
v_n1 := trunc(v_n1 / 2);
v_rtn := to_char(v_n2) || v_rtn;
exit when v_n1 = 0;
end loop;
return v_rtn;
end;
create or replace function hm_distance(a in number,b in number) return number
is
v_dis number;
v_xor number;
v_bit varchar(128);
begin
v_xor:=bitxor(a,b);
v_bit:=dec2bit(v_xor);
v_dis:=length(replace(v_bit,'0',''));
return nvl(v_dis,0);
end;
跑一下 select hm_distance(1108937774045716955,1108937774045721051) from dual ,结果为1,o了。
后面去用了下,发现fnv1居然正好撞到一个神奇的万金油,改成fnv1a就好了,代码就不改了。。。