Search in sources :

Example 11 with IWord

use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.

The roleTag method of the NSDictionaryMaker class.

/**
 * Rewrites the labels of every sentence into place-name (NS) role tags, editing each word list in place.
 *
 * <p>For each sentence the steps run in this order:
 * <ol>
 *   <li>{@code Precompiler.compileWithoutNS} is applied and begin/end sentinel words are added;</li>
 *   <li>a non-place word directly before a place word ("ns" label prefix) is tagged {@code NS.A};</li>
 *   <li>a non-place word directly after a place word is tagged {@code NS.B};</li>
 *   <li>a non-place word with a place word on both sides is tagged {@code NS.X};</li>
 *   <li>{@code CorpusUtil.spilt} is applied;</li>
 *   <li>every remaining place word is tagged {@code NS.G} or replaced by stem/suffix pieces,
 *       and every other lower-case-labelled word is tagged {@code NS.Z}.</li>
 * </ol>
 *
 * @param sentenceList the sentences to tag; each element is cast to {@code LinkedList}, so any
 *                     other list implementation causes a {@code ClassCastException}
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    int sentenceNo = 0;
    for (List<IWord> wordList : sentenceList) {
        Precompiler.compileWithoutNS(wordList);
        if (verbose) {
            System.out.print(++sentenceNo + " / " + sentenceList.size() + " ");
            System.out.println("原始语料 " + wordList);
        }

        // Surround the sentence with sentinels so the first and last real words have neighbours.
        LinkedList<IWord> words = (LinkedList<IWord>) wordList;
        words.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
        words.addLast(new Word(Predefine.TAG_END, "Z"));
        if (verbose) {
            System.out.println("添加首尾 " + wordList);
        }

        // Preceding context: walk forwards, tagging the word just before a place name.
        IWord previous = null;
        for (IWord current : words) {
            if (previous != null
                    && current.getLabel().startsWith("ns")
                    && !previous.getLabel().startsWith("ns")) {
                previous.setLabel(NS.A.toString());
            }
            previous = current;
        }
        if (verbose) {
            System.out.println("标注上文 " + wordList);
        }

        // Following context: walk backwards, tagging the word just after a place name.
        IWord following = null;
        for (Iterator<IWord> reversed = words.descendingIterator(); reversed.hasNext(); ) {
            IWord current = reversed.next();
            if (following != null
                    && current.getLabel().startsWith("ns")
                    && !following.getLabel().startsWith("ns")) {
                following.setLabel(NS.B.toString());
            }
            following = current;
        }
        if (verbose) {
            System.out.println("标注下文 " + wordList);
        }

        // In-between: slide a three-word window and tag a non-place word flanked by place names.
        IWord left = null;
        IWord middle = null;
        for (IWord right : words) {
            if (left != null
                    && left.getLabel().startsWith("ns")
                    && right.getLabel().startsWith("ns")
                    && !middle.getLabel().startsWith("ns")) {
                middle.setLabel(NS.X.toString());
            }
            left = middle;
            middle = right;
        }
        if (verbose) {
            System.out.println("标注中间 " + wordList);
        }

        // Split place names.
        CorpusUtil.spilt(wordList);
        if (verbose) {
            System.out.println("拆分地名 " + wordList);
        }

        // Final pass over the whole sentence.
        ListIterator<IWord> cursor = words.listIterator();
        while (cursor.hasNext()) {
            IWord word = cursor.next();
            String label = word.getLabel();
            // Labels that toUpperCase() leaves unchanged are skipped
            // (presumably role tags assigned by the passes above).
            if (label.equals(label.toUpperCase())) {
                continue;
            }
            if (!label.startsWith("ns")) {
                word.setLabel(NS.Z.toString());
                continue;
            }

            String value = word.getValue();
            int suffixLength = PlaceSuffixDictionary.dictionary.getLongestSuffixLength(value);
            int stemLength = value.length() - suffixLength;
            if (suffixLength == 0 || stemLength == 0) {
                // No known suffix, or the word is nothing but suffix: keep it whole.
                word.setLabel(NS.G.toString());
                continue;
            }

            // Replace the word by its stem piece(s) followed by the suffix piece.
            cursor.remove();
            if (stemLength > 3) {
                // Long stems stay in one piece.
                cursor.add(new Word(value.substring(0, stemLength), NS.G.toString()));
            } else {
                // Short stems are split per character, tagged with consecutive NS constants from NS.C on.
                int firstTag = NS.C.ordinal();
                for (int offset = 0; offset < stemLength; ++offset) {
                    cursor.add(new Word(value.substring(offset, offset + 1), NS.values()[firstTag + offset].toString()));
                }
            }
            cursor.add(new Word(value.substring(stemLength), NS.H.toString()));
        }
        if (verbose) {
            System.out.println("处理整个 " + wordList);
        }
    }
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) LinkedList(java.util.LinkedList)

Example 12 with IWord

use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.

The roleTag method of the NTDictionaryMaker class.

/**
 * Rewrites the labels of every sentence into organization-name (NT) role tags, editing each word list in place.
 *
 * <p>For each sentence: {@code Precompiler.compileWithoutNT} is applied and begin/end sentinels are
 * added; non-organization words directly before, directly after, or flanked by organization words
 * ("nt" label prefix) are tagged {@code NT.A}, {@code NT.B} and {@code NT.X} respectively; each
 * compound organization word is then replaced by its inner words, re-labelled by part of speech,
 * and the resulting tag pattern is added to {@code tfDictionary}. Finally the first word of the
 * sentence is labelled {@code NT.S}.
 *
 * @param sentenceList the sentences to tag; each element is cast to {@code LinkedList}, so any
 *                     other list implementation causes a {@code ClassCastException}
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    int sentenceNo = 0;
    for (List<IWord> wordList : sentenceList) {
        Precompiler.compileWithoutNT(wordList);
        if (verbose) {
            System.out.print(++sentenceNo + " / " + sentenceList.size() + " ");
            System.out.println("原始语料 " + wordList);
        }

        // Surround the sentence with sentinels so the first and last real words have neighbours.
        LinkedList<IWord> words = (LinkedList<IWord>) wordList;
        words.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
        words.addLast(new Word(Predefine.TAG_END, "Z"));
        if (verbose) {
            System.out.println("添加首尾 " + wordList);
        }

        // Preceding context: walk forwards, tagging the word just before an organization name.
        IWord previous = null;
        for (IWord current : words) {
            if (previous != null
                    && current.getLabel().startsWith("nt")
                    && !previous.getLabel().startsWith("nt")) {
                previous.setLabel(NT.A.toString());
            }
            previous = current;
        }
        if (verbose) {
            System.out.println("标注上文 " + wordList);
        }

        // Following context: walk backwards, tagging the word just after an organization name.
        IWord following = null;
        for (Iterator<IWord> reversed = words.descendingIterator(); reversed.hasNext(); ) {
            IWord current = reversed.next();
            if (following != null
                    && current.getLabel().startsWith("nt")
                    && !following.getLabel().startsWith("nt")) {
                following.setLabel(NT.B.toString());
            }
            following = current;
        }
        if (verbose) {
            System.out.println("标注下文 " + wordList);
        }

        // In-between: slide a three-word window and tag a non-organization word flanked by organizations.
        IWord left = null;
        IWord middle = null;
        for (IWord right : words) {
            if (left != null
                    && left.getLabel().startsWith("nt")
                    && right.getLabel().startsWith("nt")
                    && !middle.getLabel().startsWith("nt")) {
                middle.setLabel(NT.X.toString());
            }
            left = middle;
            middle = right;
        }
        if (verbose) {
            System.out.println("标注中间 " + wordList);
        }

        // Final pass over the whole sentence.
        ListIterator<IWord> cursor = words.listIterator();
        while (cursor.hasNext()) {
            IWord word = cursor.next();
            String label = word.getLabel();
            // Labels that toUpperCase() leaves unchanged are skipped
            // (presumably role tags assigned by the passes above).
            if (label.equals(label.toUpperCase())) {
                continue;
            }
            if (!label.startsWith("nt")) {
                word.setLabel(NT.Z.toString());
                continue;
            }
            if (!(word instanceof CompoundWord)) {
                // Simple (non-compound) organization word.
                word.setLabel(NT.K.toString());
                continue;
            }

            // Compound organization: replace it by its re-labelled inner words.
            cursor.remove();
            StringBuilder pattern = new StringBuilder();
            Word last = null;
            for (Word inner : ((CompoundWord) word).innerList) {
                last = inner;
                String innerLabel = inner.label;
                if (innerLabel.startsWith("ns")) {
                    inner.setValue(Predefine.TAG_PLACE);
                    inner.setLabel(NT.G.toString());
                } else if (innerLabel.startsWith("nt")) {
                    inner.value = Predefine.TAG_GROUP;
                    inner.label = NT.K.toString();
                } else if ("b".equals(innerLabel) || "ng".equals(innerLabel) || "j".equals(innerLabel)) {
                    inner.label = NT.J.toString();
                } else if ("n".equals(innerLabel) || "an".equals(innerLabel) || "a".equals(innerLabel)
                        || "vn".equals(innerLabel) || "vd".equals(innerLabel) || "vl".equals(innerLabel)
                        || "v".equals(innerLabel) || "vi".equals(innerLabel) || "nnt".equals(innerLabel)
                        || "nnd".equals(innerLabel) || "nf".equals(innerLabel) || "cc".equals(innerLabel)
                        || "t".equals(innerLabel) || "z".equals(innerLabel)) {
                    inner.label = NT.C.toString();
                } else if ("nz".equals(innerLabel)) {
                    inner.label = NT.I.toString();
                } else if ("m".equals(innerLabel)) {
                    inner.value = Predefine.TAG_NUMBER;
                    inner.label = NT.M.toString();
                } else if ("w".equals(innerLabel)) {
                    inner.label = NT.W.toString();
                } else if (innerLabel.startsWith("nr") || "x".equals(innerLabel) || "nx".equals(innerLabel)) {
                    inner.value = Predefine.TAG_PEOPLE;
                    inner.label = NT.F.toString();
                } else if (innerLabel.startsWith("ni")) {
                    inner.label = NT.D.toString();
                } else if ("f".equals(innerLabel) || "s".equals(innerLabel)) {
                    inner.label = NT.L.toString();
                } else {
                    inner.label = NT.P.toString();
                }
                // Every case inserts the re-labelled inner word and records its tag in the pattern.
                cursor.add(inner);
                pattern.append(inner.label);
            }
            if (last != null) {
                // The final inner word is always tagged D; swap the last recorded tag accordingly
                // (assumes each NT label is one character long — TODO confirm).
                last.label = NT.D.toString();
                pattern.deleteCharAt(pattern.length() - 1);
                pattern.append(last.label);
                tfDictionary.add(pattern.toString());
                pattern.setLength(0);
            }
        }
        if (verbose) {
            System.out.println("处理整个 " + wordList);
        }
        // The context passes may have re-labelled the begin sentinel; set it to S.
        words.getFirst().setLabel(NT.S.toString());
    }
}
Also used : CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) LinkedList(java.util.LinkedList)

Example 13 with IWord

use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.

The addToDictionary method of the NatureDictionaryMaker class.

/**
 * Adds every word of the corpus to {@code dictionaryMaker} and every pair of adjacent words
 * within a sentence to {@code nGramDictionaryMaker}.
 *
 * @param sentenceList the sentences to count; pairs never span two sentences
 */
@Override
protected void addToDictionary(List<List<IWord>> sentenceList) {
    logger.info("开始制作词典");
    for (List<IWord> sentence : sentenceList) {
        countSentence(sentence);
    }
}

/**
 * Counts one sentence: each word goes to the word dictionary, and each word after the first
 * is paired with its predecessor for the n-gram dictionary.
 */
private void countSentence(List<IWord> sentence) {
    IWord previous = null;
    for (IWord current : sentence) {
        dictionaryMaker.add(current);
        if (previous != null) {
            nGramDictionaryMaker.addPair(previous, current);
        }
        previous = current;
    }
}
Also used : IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Example 14 with IWord

use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.

The convert2SentenceList method of the CorpusLoader class.

/**
 * Loads the documents found at {@code path} and flattens them into a single list of sentences.
 *
 * @param path location passed unchanged to {@link #convert2DocumentList(String)}
 * @return the {@code wordList} of every sentence of every document, in document order; the
 *         inner lists are the sentences' own lists, not copies
 */
public static List<List<IWord>> convert2SentenceList(String path) {
    List<List<IWord>> sentences = new LinkedList<List<IWord>>();
    for (Document document : CorpusLoader.convert2DocumentList(path)) {
        for (Sentence sentence : document.sentenceList) {
            sentences.add(sentence.wordList);
        }
    }
    return sentences;
}
Also used : List(java.util.List) LinkedList(java.util.LinkedList) Sentence(com.hankcs.hanlp.corpus.document.sentence.Sentence) LinkedList(java.util.LinkedList) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Example 15 with IWord

use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.

The learn method of the CharacterBasedGenerativeModel class.

/**
 * Lets the model observe one segmented sentence and updates the n-gram counts in {@code tf}.
 *
 * <p>Each word is expanded into (character, tag) pairs: a one-character word yields tag
 * {@code 's'}; a longer word yields {@code 'b'} for its first character, {@code 'm'} for each
 * interior character and {@code 'e'} for its last. Unigram, bigram and trigram counts over
 * these pairs are then added to {@code tf}.
 *
 * <p>Words whose value is {@code null} or empty carry no characters and are skipped; they
 * previously aborted the call with an exception.
 *
 * @param wordList the words of the sentence, in order; must not be {@code null}
 */
public void learn(List<Word> wordList) {
    LinkedList<char[]> sentence = new LinkedList<char[]>();
    for (IWord iWord : wordList) {
        String word = iWord.getValue();
        if (word == null || word.length() == 0) {
            // Nothing to tag; charAt(0) below would throw on an empty value.
            continue;
        }
        if (word.length() == 1) {
            sentence.add(new char[] { word.charAt(0), 's' });
        } else {
            sentence.add(new char[] { word.charAt(0), 'b' });
            for (int i = 1; i < word.length() - 1; ++i) {
                sentence.add(new char[] { word.charAt(i), 'm' });
            }
            sentence.add(new char[] { word.charAt(word.length() - 1), 'e' });
        }
    }
    // Conversion finished; start counting.
    // Fixed-length window of three (character, tag) pairs.
    char[][] now = new char[3][];
    // Pad the window with bos (presumably the beginning-of-sentence marker — confirm at its declaration).
    now[1] = bos;
    now[2] = bos;
    tf.add(1, bos, bos);
    tf.add(2, bos);
    for (char[] i : sentence) {
        // Slide the window left by one slot, then append the new pair at the end.
        System.arraycopy(now, 1, now, 0, 2);
        now[2] = i;
        // unigram
        tf.add(1, i);
        // bigram
        tf.add(1, now[1], now[2]);
        // trigram
        tf.add(1, now);
    }
}
Also used : LinkedList(java.util.LinkedList) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Aggregations

IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord)17 LinkedList (java.util.LinkedList)11 Word (com.hankcs.hanlp.corpus.document.sentence.word.Word)8 List (java.util.List)8 CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord)7 CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader)4 Document (com.hankcs.hanlp.corpus.document.Document)4 Sentence (com.hankcs.hanlp.corpus.document.sentence.Sentence)4 DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)3 TFDictionary (com.hankcs.hanlp.corpus.dictionary.TFDictionary)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1