Search in sources :

Example 6 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class Document method getSimpleSentenceList.

/**
 * Returns the sentences of this document as lists of simple words.
 * Every compound word is replaced by the simple words stored in its inner list.
 *
 * @return one list of simple words per sentence, in the order of {@code sentenceList}
 */
public List<List<Word>> getSimpleSentenceList() {
    List<List<Word>> result = new LinkedList<List<Word>>();
    for (Sentence sentence : sentenceList) {
        List<Word> flattened = new LinkedList<Word>();
        for (IWord item : sentence.wordList) {
            // Anything that is not a compound word is taken over unchanged.
            if (!(item instanceof CompoundWord)) {
                flattened.add((Word) item);
                continue;
            }
            // A compound word contributes each of its inner words instead of itself.
            CompoundWord compound = (CompoundWord) item;
            for (Word part : compound.innerList) {
                flattened.add(part);
            }
        }
        result.add(flattened);
    }
    return result;
}
Also used : CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) List(java.util.List) LinkedList(java.util.LinkedList) Sentence(com.hankcs.hanlp.corpus.document.sentence.Sentence)

Example 7 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class Document method getSimpleSentenceList.

/**
 * Returns the sentences of this document as lists of simple words.
 *
 * @param spilt if {@code true}, each compound word is replaced by its inner simple words;
 *              otherwise it is converted to a single word via {@code toWord()}
 * @return one list of simple words per sentence, in the order of {@code sentenceList}
 */
public List<List<Word>> getSimpleSentenceList(boolean spilt) {
    List<List<Word>> result = new LinkedList<List<Word>>();
    for (Sentence sentence : sentenceList) {
        List<Word> simplified = new LinkedList<Word>();
        for (IWord item : sentence.wordList) {
            // Anything that is not a compound word is taken over unchanged.
            if (!(item instanceof CompoundWord)) {
                simplified.add((Word) item);
                continue;
            }
            CompoundWord compound = (CompoundWord) item;
            // Without splitting, the compound word is collapsed into one word.
            if (!spilt) {
                simplified.add(compound.toWord());
                continue;
            }
            for (Word part : compound.innerList) {
                simplified.add(part);
            }
        }
        result.add(simplified);
    }
    return result;
}
Also used : CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) List(java.util.List) LinkedList(java.util.LinkedList) Sentence(com.hankcs.hanlp.corpus.document.sentence.Sentence)

Example 8 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class Document method getSimpleSentenceList.

/**
 * Returns the sentences of this document as lists of simple words.
 * A compound word is replaced by its inner simple words only when its label is
 * contained in the given set; otherwise it is converted to a single word via
 * {@code toWord()}.
 *
 * @param labelSet labels of the compound words that should be split
 * @return one list of simple words per sentence, in the order of {@code sentenceList}
 */
public List<List<Word>> getSimpleSentenceList(Set<String> labelSet) {
    List<List<Word>> result = new LinkedList<List<Word>>();
    for (Sentence sentence : sentenceList) {
        List<Word> simplified = new LinkedList<Word>();
        for (IWord item : sentence.wordList) {
            // Anything that is not a compound word is taken over unchanged.
            if (!(item instanceof CompoundWord)) {
                simplified.add((Word) item);
                continue;
            }
            // Compound words whose label is not selected are collapsed into one word.
            if (!labelSet.contains(item.getLabel())) {
                simplified.add(((CompoundWord) item).toWord());
                continue;
            }
            for (Word part : ((CompoundWord) item).innerList) {
                simplified.add(part);
            }
        }
        result.add(simplified);
    }
    return result;
}
Also used : CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) List(java.util.List) LinkedList(java.util.LinkedList) Sentence(com.hankcs.hanlp.corpus.document.sentence.Sentence)

Example 9 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class TestICWB method testDumpPeople2014ToBEMS.

/**
 * Walks the corpus directory and dumps every sentence to a text file: one sentence
 * per line, each word value followed by a single space.
 *
 * @throws Exception if the output file cannot be opened or closed, or if walking the corpus fails
 */
public void testDumpPeople2014ToBEMS() throws Exception {
    // NOTE(review): the writer uses the platform default charset, as the original did — confirm
    // that this is the encoding the downstream consumer of the file expects.
    final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014.txt")));
    try {
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {

            @Override
            public void handle(Document document) {
                List<List<Word>> simpleSentenceList = document.getSimpleSentenceList();
                for (List<Word> wordList : simpleSentenceList) {
                    try {
                        for (Word word : wordList) {
                            bw.write(word.value);
                            bw.write(' ');
                        }
                        bw.newLine();
                    } catch (IOException e) {
                        // Best effort: a failed sentence is reported and the dump continues.
                        e.printStackTrace();
                    }
                }
            }
        });
    } finally {
        // Close (and flush) the writer even when the corpus walk throws, so the file handle is not leaked.
        bw.close();
    }
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) FileOutputStream(java.io.FileOutputStream) CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) OutputStreamWriter(java.io.OutputStreamWriter) List(java.util.List) LinkedList(java.util.LinkedList) IOException(java.io.IOException) Document(com.hankcs.hanlp.corpus.document.Document) BufferedWriter(java.io.BufferedWriter)

Example 10 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class NSDictionaryMaker method roleTag.

/**
 * Rewrites the labels of every sentence in place into {@code NS} role tags.
 * <p>
 * Per sentence the steps are: precompile, add begin/end sentinel words, tag the word
 * before each "ns"-labelled word, tag the word after each "ns"-labelled word, tag a
 * non-"ns" word sandwiched between two "ns"-labelled words, call
 * {@code CorpusUtil.spilt}, and finally replace each remaining "ns"-labelled word by
 * role-tagged pieces based on its longest place suffix. Progress is printed to
 * standard output when {@code verbose} is set.
 *
 * @param sentenceList sentences to tag; each inner list is modified in place and is
 *                     cast to {@link LinkedList}, so it must be a {@code LinkedList}
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    int i = 0;
    for (List<IWord> wordList : sentenceList) {
        // NOTE(review): presumably normalizes the words while leaving place names ("ns") alone — confirm in Precompiler.
        Precompiler.compileWithoutNS(wordList);
        if (verbose) {
            System.out.print(++i + " / " + sentenceList.size() + " ");
            System.out.println("原始语料 " + wordList);
        }
        // The passes below need addFirst/addLast/descendingIterator, hence the cast to LinkedList.
        LinkedList<IWord> wordLinkedList = (LinkedList<IWord>) wordList;
        // Add sentinel words so that the first and last real words also have a neighbour.
        wordLinkedList.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
        wordLinkedList.addLast(new Word(Predefine.TAG_END, "Z"));
        if (verbose)
            System.out.println("添加首尾 " + wordList);
        // Tag the preceding context: a non-"ns" word directly before an "ns" word gets NS.A.
        Iterator<IWord> iterator = wordLinkedList.iterator();
        IWord pre = iterator.next();
        while (iterator.hasNext()) {
            IWord current = iterator.next();
            if (current.getLabel().startsWith("ns") && !pre.getLabel().startsWith("ns")) {
                pre.setLabel(NS.A.toString());
            }
            pre = current;
        }
        if (verbose)
            System.out.println("标注上文 " + wordList);
        // Tag the following context: same check walking backwards, so "pre" is the word
        // directly after an "ns" word in reading order; it gets NS.B.
        iterator = wordLinkedList.descendingIterator();
        pre = iterator.next();
        while (iterator.hasNext()) {
            IWord current = iterator.next();
            if (current.getLabel().startsWith("ns") && !pre.getLabel().startsWith("ns")) {
                pre.setLabel(NS.B.toString());
            }
            pre = current;
        }
        if (verbose)
            System.out.println("标注下文 " + wordList);
        // Tag the middle: slide a window of three words; a non-"ns" word between two "ns" words gets NS.X.
        // The two sentinels guarantee that the list has at least two elements here.
        iterator = wordLinkedList.iterator();
        IWord first = iterator.next();
        IWord second = iterator.next();
        while (iterator.hasNext()) {
            IWord third = iterator.next();
            if (first.getLabel().startsWith("ns") && third.getLabel().startsWith("ns") && !second.getLabel().startsWith("ns")) {
                second.setLabel(NS.X.toString());
            }
            first = second;
            second = third;
        }
        if (verbose)
            System.out.println("标注中间 " + wordList);
        // Split place names.
        // NOTE(review): exact splitting rules live in CorpusUtil.spilt — not visible here.
        CorpusUtil.spilt(wordList);
        if (verbose)
            System.out.println("拆分地名 " + wordList);
        // Process the whole sentence: convert every remaining label into an NS role tag.
        ListIterator<IWord> listIterator = wordLinkedList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            String label = word.getLabel();
            // Labels without lowercase letters are left untouched; this covers the "S"/"Z" sentinels
            // and presumably the role tags assigned above — TODO confirm NS.toString() is upper case.
            if (label.equals(label.toUpperCase()))
                continue;
            if (label.startsWith("ns")) {
                String value = word.getValue();
                int longestSuffixLength = PlaceSuffixDictionary.dictionary.getLongestSuffixLength(value);
                // Length of the part of the word that precedes the matched suffix.
                int wordLength = value.length() - longestSuffixLength;
                // No suffix matched, or the whole word is the suffix: tag the word as a whole.
                if (longestSuffixLength == 0 || wordLength == 0) {
                    word.setLabel(NS.G.toString());
                    continue;
                }
                // Replace the word by its pieces; add() inserts at the position of the removed element.
                listIterator.remove();
                // Prefix longer than three characters: keep it as one NS.G piece, followed by the suffix as NS.H.
                if (wordLength > 3) {
                    listIterator.add(new Word(value.substring(0, wordLength), NS.G.toString()));
                    listIterator.add(new Word(value.substring(wordLength), NS.H.toString()));
                    continue;
                }
                // Prefix of one to three characters: one piece per character, tagged with consecutive
                // NS constants starting at NS.C (relies on the declaration order of the NS enum).
                for (int l = 1, tag = NS.C.ordinal(); l <= wordLength; ++l, ++tag) {
                    listIterator.add(new Word(value.substring(l - 1, l), NS.values()[tag].toString()));
                }
                listIterator.add(new Word(value.substring(wordLength), NS.H.toString()));
            } else {
                // Every other word that still has a lowercase label gets NS.Z.
                word.setLabel(NS.Z.toString());
            }
        }
        if (verbose)
            System.out.println("处理整个 " + wordList);
    }
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) LinkedList(java.util.LinkedList)

Aggregations

Word (com.hankcs.hanlp.corpus.document.sentence.word.Word)12 IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord)9 LinkedList (java.util.LinkedList)9 CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord)5 List (java.util.List)4 Sentence (com.hankcs.hanlp.corpus.document.sentence.Sentence)3 DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)2 BufferedReader (java.io.BufferedReader)2 FileInputStream (java.io.FileInputStream)2 InputStreamReader (java.io.InputStreamReader)2 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)1 Item (com.hankcs.hanlp.corpus.dictionary.item.Item)1 CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader)1 Document (com.hankcs.hanlp.corpus.document.Document)1 BufferedWriter (java.io.BufferedWriter)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1