packagefst;
importjava.io.File;
importjava.io.FileInputStream;
importjava.io.IOException;
importjava.io.StringReader;
importjava.util.ArrayList;
importjava.util.HashMap;
importjava.util.List;
importjava.util.Map;
importorg.apache.lucene.analysis.TokenStream;
importorg.apache.lucene.analysis.core.WhitespaceTokenizer;
importorg.apache.lucene.analysis.synonym.SynonymFilterFactory;
importorg.apache.lucene.analysis.tokenattributes.CharTermAttribute;
importorg.apache.lucene.analysis.util.FilesystemResourceLoader;
importorg.apache.lucene.store.DataInput;
importorg.apache.lucene.store.InputStreamDataInput;
importorg.apache.lucene.util.BytesRef;
importorg.apache.lucene.util.CharsRef;
importorg.apache.lucene.util.IntsRef;
importorg.apache.lucene.util.NumericUtils;
importorg.apache.lucene.util.Version;
importorg.apache.lucene.util.fst.Builder;
importorg.apache.lucene.util.fst.ByteSequenceOutputs;
importorg.apache.lucene.util.fst.FST;
importorg.apache.lucene.util.fst.Util;
classFSTDic{
FSTfst;
FST.BytesReaderfstReader;
publicFSTDic()throwsIOException{
Filefile=newFile("fst");
if(file.exists()){
fst=load(file);
}else{
Listwords=newArrayList();
words.add("中国");
words.add("中国人");
words.add("中国人民");
words.add("中国人民解放军");
fst=build(words);
}
fstReader=fst.getBytesReader();
}
publicvoidsave()throwsIOException{
fst.save(newFile("fst"));
}
publicFSTload(Filefile)throwsIOException{
returnnewFST(newInputStreamDataInput(newFileInputStream("fst")),ByteSequenceOutputs.getSingleton());
}
privateFSTbuild(Listwords)throwsIOException{
ByteSequenceOutputsoutputs=ByteSequenceOutputs.getSingleton();
Builderbuilder=newBuilder(FST.INPUT_TYPE.BYTE4,outputs);
finalIntsRefscratchIntsRef=newIntsRef();
BytesRefoutput=newBytesRef(4);
for(Stringword:words){
NumericUtils.intToPrefixCodedBytes(word.length(),0,output);
builder.add(Util.toUTF32(word,scratchIntsRef),BytesRef.deepCopyOf(output));
}
returnbuilder.finish();
}
publicbooleancontains(Stringword)throwsIOException{
FST.ArcscratchArc=newFST.Arc();
intbufUpto=0,buflen=word.length();
BytesRefpendingOutput=fst.outputs.getNoOutput();
BytesRefmatchOutput=null;
fst.getFirstArc(scratchArc);
while(bufUpto
intcodePoint=Character.codePointAt(word,bufUpto);
if(fst.findTargetArc(codePoint,scratchArc,scratchArc,fstReader)!=null){
pendingOutput=fst.outputs.add(pendingOutput,scratchArc.output);
}else{
break;
}
bufUpto =Character.charCount(codePoint);
}
if(scratchArc.isFinal()){
matchOutput=fst.outputs.add(pendingOutput,scratchArc.nextFinalOutput);
intlen=NumericUtils.prefixCodedToInt(matchOutput);
System.out.println(len);
returntrue;
}
returnfalse;
}
publicstaticvoidmain(String[]args)throwsIOException{
FSTDicdic=newFSTDic();
//dic.save();
System.out.println(dic.contains("中国"));
System.out.println(dic.contains("中国人"));
System.out.println(dic.contains("中国人民"));
System.out.println(dic.contains("中国人民解放军"));
}
}
// (Rough notes; to be filled in later...)