Search in sources :

Example 1 with Synonym

use of com.hankcs.hanlp.corpus.synonym.Synonym in project HanLP by hankcs.

the class CommonSynonymDictionary method load.

/**
 * Loads the synonym dictionary from a UTF-8 text stream and rebuilds {@code trie} from it.
 *
 * <p>Each line is split on single spaces and handed to {@code Synonym.create}; the last
 * character of the first token is used as the type of every {@code SynonymItem} created
 * for that line. Items are keyed by {@code realWord}, so a word appearing on several
 * lines keeps only the item from the last such line.
 *
 * <p>The reader (and therefore {@code inputStream}) is always closed before this method
 * returns, whether loading succeeds or fails.
 *
 * @param inputStream source of the dictionary text, decoded as UTF-8
 * @return {@code true} if the stream was read and the trie was built with result code 0;
 *         {@code false} if any exception occurred or the trie build reported an error
 */
public boolean load(InputStream inputStream) {
    trie = new DoubleArrayTrie<SynonymItem>();
    TreeMap<String, SynonymItem> treeMap = new TreeMap<String, SynonymItem>();
    // Kept outside the try so the catch block can report the line being processed.
    String line = null;
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        ArrayList<Synonym> synonymList = null;
        while ((line = reader.readLine()) != null) {
            String[] args = line.split(" ");
            synonymList = Synonym.create(args);
            // The trailing character of the first token is passed on as the item type.
            char type = args[0].charAt(args[0].length() - 1);
            for (Synonym synonym : synonymList) {
                treeMap.put(synonym.realWord, new SynonymItem(synonym, synonymList, type));
            }
        }
        // Derive the maximum id distance from the last synonym of the last line read.
        // NOTE(review): presumably relies on the dictionary being ordered by id - confirm.
        if (synonymList != null && synonymList.size() > 0) {
            maxSynonymItemIdDistance = synonymList.get(synonymList.size() - 1).id - SynonymHelper.convertString2IdWithIndex("Aa01A01", 0) + 1;
        }
        int resultCode = trie.build(treeMap);
        if (resultCode != 0) {
            logger.warning("构建" + inputStream + "失败,错误码" + resultCode);
            return false;
        }
    } catch (Exception e) {
        // Include the exception itself so the cause is not lost from the log.
        logger.warning("读取" + inputStream + "失败,可能由行" + line + "造成" + e);
        return false;
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails; the load result is already decided.
            }
        }
    }
    return true;
}
Also used : InputStreamReader(java.io.InputStreamReader) TreeMap(java.util.TreeMap) BufferedReader(java.io.BufferedReader) Synonym(com.hankcs.hanlp.corpus.synonym.Synonym)

Example 2 with Synonym

use of com.hankcs.hanlp.corpus.synonym.Synonym in project HanLP by hankcs.

the class CommonSynonymDictionaryEx method load.

/**
 * Loads the synonym dictionary from a UTF-8 text stream and rebuilds {@code trie}, mapping
 * each word to the sorted array of all synonym ids it appears under.
 *
 * <p>Each line is split on single spaces and handed to {@code Synonym.create}; the id of
 * every resulting synonym is collected into a per-word sorted set, so a word listed on
 * several lines accumulates all of its ids without duplicates.
 *
 * <p>The reader (and therefore {@code inputStream}) is always closed before this method
 * returns, whether loading succeeds or fails.
 *
 * @param inputStream source of the dictionary text, decoded as UTF-8
 * @return {@code true} if the stream was read and the trie was built with result code 0;
 *         {@code false} if any exception occurred or the trie build reported an error
 */
public boolean load(InputStream inputStream) {
    trie = new DoubleArrayTrie<Long[]>();
    TreeMap<String, Set<Long>> treeMap = new TreeMap<String, Set<Long>>();
    // Kept outside the try so the catch block can report the line being processed.
    String line = null;
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        while ((line = reader.readLine()) != null) {
            String[] args = line.split(" ");
            List<Synonym> synonymList = Synonym.create(args);
            for (Synonym synonym : synonymList) {
                Set<Long> idSet = treeMap.get(synonym.realWord);
                if (idSet == null) {
                    idSet = new TreeSet<Long>();
                    treeMap.put(synonym.realWord, idSet);
                }
                idSet.add(synonym.id);
            }
        }
        // Keys and values are copied in the TreeMap's iteration order, so the two lists
        // stay aligned index by index.
        List<String> keyList = new ArrayList<String>(treeMap.keySet());
        List<Long[]> valueList = new ArrayList<Long[]>(treeMap.size());
        for (Set<Long> idSet : treeMap.values()) {
            valueList.add(idSet.toArray(new Long[0]));
        }
        int resultCode = trie.build(keyList, valueList);
        if (resultCode != 0) {
            logger.warning("构建" + inputStream + "失败,错误码" + resultCode);
            return false;
        }
    } catch (Exception e) {
        logger.warning("读取" + inputStream + "失败,可能由行" + line + "造成" + e);
        return false;
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails; the load result is already decided.
            }
        }
    }
    return true;
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) Synonym(com.hankcs.hanlp.corpus.synonym.Synonym)

Example 3 with Synonym

use of com.hankcs.hanlp.corpus.synonym.Synonym in project HanLP by hankcs.

the class CommonSynonymDictionary method rewrite.

/**
 * Rewrites {@code text} by segmenting it with {@code StandardTokenizer} and substituting
 * each term with a synonym whenever the dictionary offers one.
 *
 * <p>For every term, the dictionary entry is looked up via {@code get}; if an entry exists
 * and {@code randomSynonym(Type.EQUAL, ...)} yields a synonym, that synonym's
 * {@code realWord} is emitted, otherwise the term's own word is kept. The context passed
 * to {@code randomSynonym} starts as {@code Predefine.TAG_BIGIN} and is afterwards the
 * {@code PosTagCompiler.compile} result for the previous original term.
 *
 * @param text the text to rewrite
 * @return the rewritten text
 */
public String rewrite(String text) {
    List<Term> terms = StandardTokenizer.segment(text.toCharArray());
    StringBuilder rewritten = new StringBuilder((int) (text.length() * 1.2));
    String previous = Predefine.TAG_BIGIN;
    for (Term term : terms) {
        SynonymItem entry = get(term.word);
        Synonym replacement = null;
        if (entry != null) {
            replacement = entry.randomSynonym(Type.EQUAL, previous);
        }
        if (replacement == null) {
            // No dictionary entry, or no suitable synonym: keep the original word.
            rewritten.append(term.word);
        } else {
            rewritten.append(replacement.realWord);
        }
        // The context always tracks the original term, not the substituted word.
        previous = PosTagCompiler.compile(term.nature.toString(), term.word);
    }
    return rewritten.toString();
}
Also used : Synonym(com.hankcs.hanlp.corpus.synonym.Synonym) Term(com.hankcs.hanlp.seg.common.Term)

Example 4 with Synonym

use of com.hankcs.hanlp.corpus.synonym.Synonym in project HanLP by hankcs.

the class CommonSynonymDictionary method rewriteQuickly.

/**
 * Rewrites {@code text} without tokenizing it, by scanning {@code trie} directly for the
 * longest dictionary match starting at each position.
 *
 * <p>At each position the first character is fed to the trie from state 1; if that
 * transition yields a positive state, following characters are consumed until a transition
 * returns a negative state or the text ends, remembering the last state that produced an
 * output. A match is replaced by {@code randomSynonym(Type.EQUAL, previous).realWord} when
 * a synonym is available, otherwise the matched substring is copied unchanged; scanning
 * then resumes right after the match. Where nothing matches, a single character is copied.
 * Outputs are only checked after the second and later characters, so a match always spans
 * at least two characters.
 *
 * @param text the text to rewrite; must not be {@code null}
 * @return the rewritten text
 */
public String rewriteQuickly(String text) {
    assert text != null;
    final int length = text.length();
    StringBuilder rewritten = new StringBuilder((int) (length * 1.2));
    String previous = Predefine.TAG_BIGIN;
    int pos = 0;
    while (pos < length) {
        SynonymItem matched = null;
        int matchEnd = -1;
        int state = trie.transition(text.charAt(pos), 1);
        if (state > 0) {
            // Extend the candidate as far as the trie allows, keeping the longest hit.
            for (int cursor = pos + 1; cursor < length; ++cursor) {
                state = trie.transition(text.charAt(cursor), state);
                if (state < 0) {
                    break;
                }
                SynonymItem candidate = trie.output(state);
                if (candidate != null) {
                    matched = candidate;
                    matchEnd = cursor + 1;
                }
            }
        }
        if (matched == null) {
            // No dictionary word starts here: copy one character and move on.
            previous = String.valueOf(text.charAt(pos));
            rewritten.append(text.charAt(pos));
            ++pos;
            continue;
        }
        Synonym replacement = matched.randomSynonym(Type.EQUAL, previous);
        // Whatever gets emitted for this match also becomes the context for the next one.
        previous = (replacement != null) ? replacement.realWord : text.substring(pos, matchEnd);
        rewritten.append(previous);
        pos = matchEnd;
    }
    return rewritten.toString();
}
Also used : Synonym(com.hankcs.hanlp.corpus.synonym.Synonym)

Aggregations

Synonym (com.hankcs.hanlp.corpus.synonym.Synonym): 4, BufferedReader (java.io.BufferedReader): 2, InputStreamReader (java.io.InputStreamReader): 2, Term (com.hankcs.hanlp.seg.common.Term): 1, TreeMap (java.util.TreeMap): 1