配置pom文件
```xml
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.6.4</version>
</dependency>
```
当前最新版本为1.6.4。可以在这里查看最新版本。
添加自定义字典文件
在resources下面建立目录,存放自定义字典。
我的目录是resources/hanlp/custom/CustomDic.txt
编写Java类
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
| public class WordUtil {
// Classpath resource paths (resolved relative to WordUtil.class) of the custom dictionary and of the synonym file.
public static final String CUSTOM_DICTIONARY_PATH = "/hanlp/custom/CustomDic.txt"; public static final String SYNONYM_WORD_PATH = "/hanlp/synonym/SynonDic.txt";
// list: terms produced by the most recent segment.seg(...) call made in segmentSentence.
// segment: HanLP segmenter created with the custom dictionary enabled.
// synonymMap: filled by refreshSynonym; maps every word of a synonym line to the first word of that line. Read by getSynonym.
private static List<Term> list; private static Segment segment; private static Map synonymMap;
/**
 * Demo entry point: runs the segmenter once on a fixed sample sentence.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String sample = "你是柴犬狗狗吧";
    WordUtil.segmentSentence(sample);
}
/**
 * Points HanLP at the custom dictionary shipped on the classpath, reloads it and
 * recreates the shared segmenter with the custom dictionary enabled.
 *
 * @throws NullPointerException if {@link #CUSTOM_DICTIONARY_PATH} cannot be found on the classpath
 */
public static void refreshDictionary() {
    java.net.URL resource = java.util.Objects.requireNonNull(
            WordUtil.class.getResource(CUSTOM_DICTIONARY_PATH),
            "Custom dictionary not found on classpath: " + CUSTOM_DICTIONARY_PATH);
    HanLP.Config.CustomDictionaryPath = new String[] { toFileSystemPath(resource) };
    CustomDictionary.reload();
    segment = HanLP.newSegment().enableCustomDictionary(true);
}

/**
 * Converts a resource URL into a plain file-system path.
 *
 * URL.getPath() keeps percent-encoding (a space stays "%20", non-ASCII characters are
 * escaped), so it is not a valid file name when the project directory contains such
 * characters. Going through the URI decodes it properly.
 */
private static String toFileSystemPath(java.net.URL resource) {
    if ("file".equals(resource.getProtocol())) {
        try {
            return new java.io.File(resource.toURI()).getPath();
        } catch (java.net.URISyntaxException | IllegalArgumentException e) {
            // Not convertible to a local file; fall back to the raw URL path below.
        }
    }
    return resource.getPath();
}
/**
 * Segments a sentence with HanLP (custom dictionary enabled) and joins the resulting
 * words with single spaces. The joined result is also printed to standard output.
 *
 * @param text the sentence to segment; may be null or empty
 * @return the words separated by single spaces, or an empty string when the text is
 *         null/empty or segmentation yields no terms
 */
public static String segmentSentence(String text) {
    // Nothing to segment: avoid handing null to HanLP and indexing into an empty result.
    if (text == null || text.isEmpty()) {
        return "";
    }
    segment = HanLP.newSegment().enableCustomDictionary(true);
    list = segment.seg(text);
    List<Term> terms = list;
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < terms.size(); i++) {
        if (i > 0) {
            joined.append(' ');
        }
        joined.append(terms.get(i).word);
    }
    String result = joined.toString();
    System.out.println(result);
    return result;
}
/**
 * Rebuilds the synonym table from the classpath resource {@link #SYNONYM_WORD_PATH}.
 *
 * Each non-blank line holds whitespace-separated words; every word on the line
 * (including the first) is mapped to the first word of that line.
 * The file is read as UTF-8 and through a stream, so it also works when the resource
 * is packaged inside a jar or lives under a path containing spaces or non-ASCII characters.
 *
 * If reading fails part-way, the stack trace is printed and the table keeps whatever
 * entries were read before the failure.
 *
 * @throws NullPointerException if the synonym resource cannot be found on the classpath
 */
public static void refreshSynonym() {
    java.io.InputStream stream = java.util.Objects.requireNonNull(
            WordUtil.class.getResourceAsStream(SYNONYM_WORD_PATH),
            "Synonym file not found on classpath: " + SYNONYM_WORD_PATH);
    Map<String, String> loaded = new HashMap<String, String>();
    // try-with-resources closes the reader (and the underlying stream) on every path.
    try (BufferedReader reader = new BufferedReader(
            new java.io.InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            String trimmed = line.trim();
            if (trimmed.isEmpty()) {
                continue; // a blank line would otherwise register "" as a word
            }
            String[] words = trimmed.split("\\s+");
            String canonical = words[0];
            for (String word : words) {
                loaded.put(word, canonical);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // Publish the new table only after reading has finished.
    synonymMap = loaded;
}
/**
 * Looks up the canonical synonym of a word.
 *
 * The synonym table is loaded on first use if {@code refreshSynonym()} has not been
 * called yet, instead of failing with a NullPointerException.
 *
 * @param word the word to look up
 * @return the word registered as its synonym, or null if the word is not in the table
 */
public static String getSynonym(String word) {
    if (synonymMap == null) {
        refreshSynonym();
    }
    return (String) synonymMap.get(word);
}
}
|
为了提高效率,HanLP会将字典文件编译并缓存为bin文件,之后每次都从bin文件中读取词条。
因此当字典更新后,需要删除旧的bin文件并重新生成。官方提供了CustomDictionary.reload()方法来完成这一步,但调用前需要先通过HanLP.Config.CustomDictionaryPath指定自定义字典的位置。
当调用分词方法(如segment.seg())时,HanLP会先尝试从对应的bin文件中读取;如果bin文件不存在,则先根据字典文件重新生成bin文件,再进行读取。