Search in sources :

Example 36 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class TestSegment method testIssue193.

/**
 * Smoke test for issue 193: segments a fixed set of sentences with organization
 * and number-quantifier recognition enabled and prints each resulting term list.
 */
public void testIssue193() throws Exception {
    String[] sentences = {
        "以每台约200元的价格送到苹果售后维修中心换新机(苹果的保修基本是免费换新机)",
        "可能以2500~2800元的价格回收",
        "3700个益农信息社打通服务“最后一公里”",
        "一位李先生给高政留言说上周五可以帮忙献血",
        "一位浩宁达高层透露",
        "五和万科长阳天地5个普宅项目",
        "以1974点低点和5178点高点作江恩角度线",
        "纳入统计的18家京系基金公司",
        "华夏基金与嘉实基金两家京系基金公司",
        "则应从排名第八的投标人开始依次递补三名投标人"
    };

    // Configure the segmenter step by step, keeping whatever instance each call returns.
    Segment segmenter = HanLP.newSegment();
    segmenter = segmenter.enableOrganizationRecognize(true);
    segmenter = segmenter.enableNumberQuantifierRecognize(true);

    for (int index = 0; index < sentences.length; index++) {
        List<Term> terms = segmenter.seg(sentences[index]);
        System.out.println(terms);
    }
}
Also used : Term(com.hankcs.hanlp.seg.common.Term) ResultTerm(com.hankcs.hanlp.seg.common.ResultTerm) Segment(com.hankcs.hanlp.seg.Segment) DoubleArrayTrieSegment(com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment) CRFSegment(com.hankcs.hanlp.seg.CRF.CRFSegment) DijkstraSegment(com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment) ViterbiSegment(com.hankcs.hanlp.seg.Viterbi.ViterbiSegment)

Example 37 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class TextRankKeyword method getRank.

/**
 * Computes rank scores from a term list that has already been segmented.
 *
 * @param termList the segmented terms to rank
 * @return a map from each retained word to its score when the iteration stops
 */
public Map<String, Float> getRank(List<Term> termList) {
    // Keep only the words of the terms accepted by shouldInclude.
    List<String> candidates = new ArrayList<String>(termList.size());
    for (Term term : termList) {
        if (!shouldInclude(term)) {
            continue;
        }
        candidates.add(term.word);
    }

    // Link every pair of distinct words that share a sliding window of the last five words.
    Map<String, Set<String>> graph = new TreeMap<String, Set<String>>();
    Queue<String> window = new LinkedList<String>();
    for (String word : candidates) {
        if (graph.get(word) == null) {
            graph.put(word, new TreeSet<String>());
        }
        window.offer(word);
        if (window.size() > 5) {
            window.poll();
        }
        for (String left : window) {
            Set<String> leftNeighbours = graph.get(left);
            for (String right : window) {
                if (!left.equals(right)) {
                    leftNeighbours.add(right);
                    graph.get(right).add(left);
                }
            }
        }
    }

    // Iterate until the largest per-word change is at most min_diff or max_iter rounds have run.
    Map<String, Float> score = new HashMap<String, Float>();
    for (int round = 0; round < max_iter; ++round) {
        Map<String, Float> next = new HashMap<String, Float>();
        float largestChange = 0;
        for (Map.Entry<String, Set<String>> node : graph.entrySet()) {
            String word = node.getKey();
            float rank = 1 - d;
            for (String neighbour : node.getValue()) {
                int degree = graph.get(neighbour).size();
                if (degree == 0 || word.equals(neighbour)) {
                    continue;
                }
                // A word with no score yet contributes zero.
                Float neighbourScore = score.get(neighbour);
                rank += d / degree * (neighbourScore == null ? 0 : neighbourScore);
            }
            next.put(word, rank);
            Float previous = score.get(word);
            largestChange = Math.max(largestChange, Math.abs(rank - (previous == null ? 0 : previous)));
        }
        score = next;
        if (largestChange <= min_diff) {
            break;
        }
    }
    return score;
}
Also used : Term(com.hankcs.hanlp.seg.common.Term)

Example 38 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class DoubleArrayTrieSegment method segSentence.

/**
 * Segments a sentence by keeping, for every start position, the longest word
 * found in the core dictionary trie (and optionally the custom dictionary),
 * then walking the sentence from left to right and emitting one term per
 * chosen word.
 *
 * @param sentence the characters to segment
 * @return the terms in sentence order, each with its offset set to its start
 *         index; natures are null when speech tagging is disabled
 */
@Override
protected List<Term> segSentence(char[] sentence) {
    char[] charArray = sentence;
    // wordNet[i] is the length of the word chosen to start at i; every position starts as a 1-char word.
    final int[] wordNet = new int[charArray.length];
    Arrays.fill(wordNet, 1);
    // Natures are only tracked when speech tagging is enabled; otherwise the array is null.
    final Nature[] natureArray = config.speechTagging ? new Nature[charArray.length] : null;
    DoubleArrayTrie<CoreDictionary.Attribute>.Searcher<CoreDictionary.Attribute> searcher = CoreDictionary.trie.getSearcher(sentence, 0);
    // Core dictionary pass: a hit replaces the current choice only when it is strictly longer.
    while (searcher.next()) {
        int length = searcher.length;
        if (length > wordNet[searcher.begin]) {
            wordNet[searcher.begin] = length;
            if (config.speechTagging) {
                // Use the first nature listed for the matched entry.
                natureArray[searcher.begin] = searcher.value.nature[0];
            }
        }
    }
    if (config.useCustomDictionary) {
        // Custom dictionary pass: same strictly-longer rule, applied on top of the core dictionary result.
        CustomDictionary.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<CoreDictionary.Attribute>() {

            @Override
            public void hit(int begin, int end, CoreDictionary.Attribute value) {
                int length = end - begin;
                if (length > wordNet[begin]) {
                    wordNet[begin] = length;
                    if (config.speechTagging) {
                        natureArray[begin] = value.nature[0];
                    }
                }
            }
        });
    }
    LinkedList<Term> termList = new LinkedList<Term>();
    if (config.speechTagging) {
        // Tagging pass: for each maximal run [i, j) of positions without a nature,
        // ask quickAtomSegment for atom nodes and record their lengths and natures.
        for (int i = 0; i < natureArray.length; ) {
            if (natureArray[i] == null) {
                int j = i + 1;
                for (; j < natureArray.length; ++j) {
                    if (natureArray[j] != null)
                        break;
                }
                // NOTE(review): presumably the atom nodes tile [i, j) in order — confirm against quickAtomSegment.
                List<AtomNode> atomNodeList = quickAtomSegment(charArray, i, j);
                for (AtomNode atomNode : atomNodeList) {
                    // NOTE(review): i only advances when the atom is at least as long as the current
                    // choice at i; if this test fails, later atoms are compared at the same index — verify intent.
                    if (atomNode.sWord.length() >= wordNet[i]) {
                        wordNet[i] = atomNode.sWord.length();
                        natureArray[i] = atomNode.getNature();
                        i += wordNet[i];
                    }
                }
                // Resume scanning at the end of the run regardless of where the atom loop left i.
                i = j;
            } else {
                ++i;
            }
        }
    }
    // Emit terms left to right, jumping by the chosen word length at each start position.
    for (int i = 0; i < wordNet.length; ) {
        // With speech tagging on, a still-missing nature falls back to Nature.nz; with it off, the nature is null.
        Term term = new Term(new String(charArray, i, wordNet[i]), config.speechTagging ? (natureArray[i] == null ? Nature.nz : natureArray[i]) : null);
        term.offset = i;
        termList.add(term);
        i += wordNet[i];
    }
    return termList;
}
Also used : Nature(com.hankcs.hanlp.corpus.tag.Nature) Term(com.hankcs.hanlp.seg.common.Term) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) DoubleArrayTrie(com.hankcs.hanlp.collection.trie.DoubleArrayTrie) LinkedList(java.util.LinkedList) AhoCorasickDoubleArrayTrie(com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie) CoreDictionary(com.hankcs.hanlp.dictionary.CoreDictionary) AtomNode(com.hankcs.hanlp.seg.NShort.Path.AtomNode)

Example 39 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class CoreSynonymDictionaryEx method convert.

/**
 * Converts a segmentation result into a list of synonym items.
 *
 * @param sentence the segmented sentence
 * @param withUndefinedItem whether to keep words that have no dictionary entry
 * @return one item per retained term, in sentence order
 */
public static List<Long[]> convert(List<Term> sentence, boolean withUndefinedItem) {
    List<Long[]> result = new ArrayList<Long[]>(sentence.size());
    for (Term term : sentence) {
        // Terms without a nature are skipped.
        if (term.nature == null) {
            continue;
        }
        char tag = term.nature.toString().charAt(0);
        // Natures starting with 'w' are always dropped.
        if (tag == 'w') {
            continue;
        }
        // Natures starting with 'm' are kept only when the word is entirely Chinese.
        if (tag == 'm' && !TextUtility.isAllChinese(term.word)) {
            continue;
        }
        // Stop words are dropped.
        if (CoreStopWordDictionary.contains(term.word)) {
            continue;
        }
        Long[] ids = get(term.word);
        if (ids != null) {
            result.add(ids);
        } else if (withUndefinedItem) {
            // Words without an entry get a single placeholder id.
            result.add(new Long[] { Long.MAX_VALUE / 3 });
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Term(com.hankcs.hanlp.seg.common.Term)

Example 40 with Term

use of com.hankcs.hanlp.seg.common.Term in project HanLP by hankcs.

the class CommonSynonymDictionary method rewrite.

/**
 * Rewrites a text by segmenting it and replacing each term with a randomly
 * chosen synonym of type EQUAL when one is available; other terms are copied
 * through unchanged.
 *
 * @param text the text to rewrite
 * @return the rewritten text
 */
public String rewrite(String text) {
    List<Term> terms = StandardTokenizer.segment(text.toCharArray());
    StringBuilder output = new StringBuilder((int) (text.length() * 1.2));
    // Context passed to the synonym picker; starts at the begin tag and is
    // recompiled from each term after it has been emitted.
    String previous = Predefine.TAG_BIGIN;
    for (Term term : terms) {
        Synonym replacement = null;
        SynonymItem entry = get(term.word);
        if (entry != null) {
            replacement = entry.randomSynonym(Type.EQUAL, previous);
        }
        if (replacement == null) {
            output.append(term.word);
        } else {
            output.append(replacement.realWord);
        }
        previous = PosTagCompiler.compile(term.nature.toString(), term.word);
    }
    return output.toString();
}
Also used : Synonym(com.hankcs.hanlp.corpus.synonym.Synonym) Term(com.hankcs.hanlp.seg.common.Term)

Aggregations

Term (com.hankcs.hanlp.seg.common.Term)48 Segment (com.hankcs.hanlp.seg.Segment)12 DijkstraSegment (com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment)8 LinkedList (java.util.LinkedList)7 CRFSegment (com.hankcs.hanlp.seg.CRF.CRFSegment)5 ResultTerm (com.hankcs.hanlp.seg.common.ResultTerm)5 Vertex (com.hankcs.hanlp.seg.common.Vertex)5 CoNLLSentence (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence)4 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)4 DoubleArrayTrieSegment (com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment)4 ViterbiSegment (com.hankcs.hanlp.seg.Viterbi.ViterbiSegment)4 ArrayList (java.util.ArrayList)4 Nature (com.hankcs.hanlp.corpus.tag.Nature)3 CoreDictionary (com.hankcs.hanlp.dictionary.CoreDictionary)3 AhoCorasickDoubleArrayTrie (com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie)2 Filter (com.hankcs.hanlp.dictionary.stopword.Filter)2 Table (com.hankcs.hanlp.model.crf.Table)2 HMMSegment (com.hankcs.hanlp.seg.HMM.HMMSegment)2 AtomNode (com.hankcs.hanlp.seg.NShort.Path.AtomNode)2 File (java.io.File)2