Search in sources :

Example 1 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class WordNatureWeightModelMaker method addPair.

/**
 * Registers two entries in the given dictionary maker for a {@code from}/{@code to} pair:
 * one keyed {@code from + "@" + to} carrying {@code label}, and one keyed
 * {@code from + "@"} carrying the fixed label "频次".
 *
 * @param from            left-hand part of the key
 * @param to              right-hand part of the key
 * @param label           label attached to the combined {@code from@to} entry
 * @param dictionaryMaker dictionary maker that receives both entries
 */
private static void addPair(String from, String to, String label, DictionaryMaker dictionaryMaker) {
    final String keyPrefix = from + "@";
    final Word pairEntry = new Word(keyPrefix + to, label);
    dictionaryMaker.add(pairEntry);
    final Word prefixEntry = new Word(keyPrefix, "频次");
    dictionaryMaker.add(prefixEntry);
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) CoNLLWord(com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)

Example 2 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class TestICWB method testDumpPeople2014ToBEMS.

/**
 * Walks the corpus under a fixed directory and dumps it to a fixed text file:
 * one simple sentence per line, every word value followed by a single space.
 *
 * <p>The writer is closed in a {@code finally} block so the output file handle is
 * released even when the corpus walk throws.
 *
 * @throws Exception if the output file cannot be opened or closed, or the walk fails
 */
public void testDumpPeople2014ToBEMS() throws Exception {
    // NOTE(review): no charset is given, so the platform default encoding is used — confirm this is intended.
    final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014.txt")));
    try {
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {

            @Override
            public void handle(Document document) {
                List<List<Word>> simpleSentenceList = document.getSimpleSentenceList();
                for (List<Word> wordList : simpleSentenceList) {
                    try {
                        for (Word word : wordList) {
                            bw.write(word.value);
                            bw.write(' ');
                        }
                        bw.newLine();
                    } catch (IOException e) {
                        // Best effort: report the failed sentence and continue with the next one.
                        e.printStackTrace();
                    }
                }
            }
        });
    } finally {
        // Always release the file handle, including when walk() fails part-way through.
        bw.close();
    }
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) FileOutputStream(java.io.FileOutputStream) CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) OutputStreamWriter(java.io.OutputStreamWriter) List(java.util.List) LinkedList(java.util.LinkedList) IOException(java.io.IOException) Document(com.hankcs.hanlp.corpus.document.Document) BufferedWriter(java.io.BufferedWriter)

Example 3 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class NameDictionaryMaker method create.

/**
 * Builds a {@link DictionaryMaker} from a UTF-8 text file that holds one name per line.
 *
 * <p>Lines matching the rejection pattern (punctuation, the listed symbols, whitespace,
 * ASCII letters or digits) are skipped. Only lines of length 2 or 3 whose first character
 * is contained in {@code FamilyName} contribute entries: the characters are added one by
 * one with the labels {@code NR.B}/{@code NR.E} (length 2) or
 * {@code NR.B}/{@code NR.C}/{@code NR.D} (length 3).
 *
 * @param path path of the name list file
 * @return the populated dictionary maker, or {@code null} if reading the file failed
 */
public static DictionaryMaker create(String path) {
    DictionaryMaker dictionaryMaker = new DictionaryMaker();
    // Compile the rejection pattern once instead of once per line (String.matches recompiles on every call).
    final java.util.regex.Pattern rejected = java.util.regex.Pattern.compile(".*[\\p{P}+~$`^=|<>~`$^+=|<>¥×|\\s|a-z0-9A-Z]+.*");
    try {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (rejected.matcher(line).matches())
                    continue;
                // Only load two-character and three-character names.
                int length = line.length();
                switch(length) {
                    case 2:
                        {
                            Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                            if (!FamilyName.contains(wordB.value))
                                break;
                            Word wordE = new Word(line.substring(1), NR.E.toString());
                            dictionaryMaker.add(wordB);
                            dictionaryMaker.add(wordE);
                            break;
                        }
                    case 3:
                        {
                            Word wordB = new Word(line.substring(0, 1), NR.B.toString());
                            if (!FamilyName.contains(wordB.value))
                                break;
                            Word wordC = new Word(line.substring(1, 2), NR.C.toString());
                            Word wordD = new Word(line.substring(2, 3), NR.D.toString());
                            dictionaryMaker.add(wordB);
                            dictionaryMaker.add(wordC);
                            dictionaryMaker.add(wordD);
                            break;
                        }
                    default:
                        // Names of any other length are ignored.
                        break;
                }
            }
        } finally {
            // Close the reader even when a line fails to process; a close failure is
            // still reported through the outer catch, as before.
            br.close();
        }
        logger.info(dictionaryMaker.toString());
    } catch (Exception e) {
        // Include the exception so the cause of the failure is not lost.
        logger.warning("读取" + path + "发生错误" + ": " + e);
        return null;
    }
    return dictionaryMaker;
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) FileInputStream(java.io.FileInputStream)

Example 4 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class NRDictionaryMaker method roleTag.

/**
 * Re-labels every sentence in place with NR role tags, running a fixed sequence of passes
 * over each word list and finally wrapping the sentence in begin/end sentinel words.
 *
 * <p>The passes share one {@link ListIterator} where possible: a forward pass leaves the
 * cursor at the end of the list so the following backward pass can start there, and vice
 * versa. The statement order is therefore significant.
 *
 * <p>Each word list is cast to {@code LinkedList<IWord>} at the end, so callers must pass
 * sentences backed by {@code LinkedList}.
 *
 * @param sentenceList the sentences to tag; both the lists and the words in them are modified
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    logger.info("开始标注角色");
    int i = 0;
    for (List<IWord> wordList : sentenceList) {
        logger.info(++i + " / " + sentenceList.size());
        if (verbose)
            System.out.println("原始语料 " + wordList);
        // Pass 1 (forward): label A and K. Every word that is not labeled nr becomes A;
        // when an nr word follows a word that is not labeled nr, that preceding word becomes K.
        IWord pre = new Word("##始##", "begin");
        ListIterator<IWord> listIterator = wordList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (!word.getLabel().equals(Nature.nr.toString())) {
                word.setLabel(NR.A.toString());
            } else {
                if (!pre.getLabel().equals(Nature.nr.toString())) {
                    pre.setLabel(NR.K.toString());
                }
            }
            pre = word;
        }
        if (verbose)
            System.out.println("标注非前 " + wordList);
        // Pass 2 (backward, reusing the iterator now positioned at the end): label L and M.
        // For each nr word, the word after it is relabeled A -> L or K -> M.
        IWord next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(Nature.nr.toString())) {
                String label = next.getLabel();
                if (label.equals("A"))
                    next.setLabel("L");
                else if (label.equals("K"))
                    next.setLabel("M");
            }
            next = word;
        }
        if (verbose)
            System.out.println("标注中后 " + wordList);
        // Pass 3: split names. An nr word of length 2 or 3 is broken into single characters:
        // the current word keeps the first character and the rest are inserted right after it.
        listIterator = wordList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(Nature.nr.toString())) {
                switch(word.getValue().length()) {
                    case 2:
                        if (word.getValue().startsWith("大") || word.getValue().startsWith("老") || word.getValue().startsWith("小")) {
                            // Listed leading character: first char -> F, second char -> B.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.B.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.F.toString());
                        } else if (word.getValue().endsWith("哥") || word.getValue().endsWith("公") || word.getValue().endsWith("姐") || word.getValue().endsWith("老") || word.getValue().endsWith("某") || word.getValue().endsWith("嫂") || word.getValue().endsWith("氏") || word.getValue().endsWith("总")) {
                            // Listed trailing character: first char -> B, second char -> G.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.G.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.B.toString());
                        } else {
                            // Any other two-character name: first char -> B, second char -> E.
                            listIterator.add(new Word(word.getValue().substring(1, 2), NR.E.toString()));
                            word.setValue(word.getValue().substring(0, 1));
                            word.setLabel(NR.B.toString());
                        }
                        break;
                    case 3:
                        // Three-character name: chars are labeled B, C, D in order.
                        listIterator.add(new Word(word.getValue().substring(1, 2), NR.C.toString()));
                        listIterator.add(new Word(word.getValue().substring(2, 3), NR.D.toString()));
                        word.setValue(word.getValue().substring(0, 1));
                        word.setLabel(NR.B.toString());
                        break;
                }
            }
        }
        if (verbose)
            System.out.println("姓名拆分 " + wordList);
        // Pass 4 (forward): preceding context forms a word. If the previous word's value plus
        // a B word's value is in the dictionary, the previous word absorbs it (label "U")
        // and the B word is removed from the list.
        listIterator = wordList.listIterator();
        pre = new Word("##始##", "begin");
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(NR.B.toString())) {
                String combine = pre.getValue() + word.getValue();
                if (dictionary.contains(combine)) {
                    pre.setValue(combine);
                    pre.setLabel("U");
                    listIterator.remove();
                }
            }
            pre = word;
        }
        if (verbose)
            System.out.println("上文成词 " + wordList);
        // Pass 5 (backward): name head forms a word. If a B word's value plus the following
        // word's value is in the dictionary, the following word absorbs it and is labeled X
        // when it was C, otherwise Y; the B word is removed.
        next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(NR.B.toString())) {
                String combine = word.getValue() + next.getValue();
                if (dictionary.contains(combine)) {
                    next.setValue(combine);
                    next.setLabel(next.getLabel().equals(NR.C.toString()) ? NR.X.toString() : NR.Y.toString());
                    listIterator.remove();
                }
            }
            next = word;
        }
        if (verbose)
            System.out.println("头部成词 " + wordList);
        // Pass 6 (forward, iterator is back at the start): name tail forms a word. If the
        // previous word's value plus a D word's value is in the dictionary, the previous word
        // absorbs it (label Z) and the D word is removed.
        pre = new Word("##始##", "begin");
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            if (word.getLabel().equals(NR.D.toString())) {
                String combine = pre.getValue() + word.getValue();
                if (dictionary.contains(combine)) {
                    pre.setValue(combine);
                    pre.setLabel(NR.Z.toString());
                    listIterator.remove();
                }
            }
            pre = word;
        }
        if (verbose)
            System.out.println("尾部成词 " + wordList);
        // Pass 7 (backward): following context forms a word. If a D word's value plus the
        // following word's value is in the dictionary, the following word absorbs it
        // (label V) and the D word is removed.
        next = new Word("##末##", "end");
        while (listIterator.hasPrevious()) {
            IWord word = listIterator.previous();
            if (word.getLabel().equals(NR.D.toString())) {
                String combine = word.getValue() + next.getValue();
                if (dictionary.contains(combine)) {
                    next.setValue(combine);
                    next.setLabel(NR.V.toString());
                    listIterator.remove();
                }
            }
            next = word;
        }
        // Fixed: this trace previously repeated the pass-5 label ("头部成词") by copy-paste.
        if (verbose)
            System.out.println("下文成词 " + wordList);
        // Wrap the sentence with begin/end sentinel words labeled "S" and "A".
        LinkedList<IWord> wordLinkedList = (LinkedList<IWord>) wordList;
        wordLinkedList.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
        wordLinkedList.addLast(new Word(Predefine.TAG_END, "A"));
        if (verbose)
            System.out.println("添加首尾 " + wordList);
    }
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) LinkedList(java.util.LinkedList)

Example 5 with Word

use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.

the class NatureDictionaryMaker method roleTag.

/**
 * Prepares every sentence for tagging: each word is passed through
 * {@code Precompiler.compile}, then the sentence is wrapped with a begin sentinel
 * (label {@code Nature.begin}) and an end sentinel (label {@code Nature.end}).
 *
 * <p>Each word list is cast to {@code LinkedList<IWord>}, so callers must pass sentences
 * backed by {@code LinkedList}.
 *
 * @param sentenceList the sentences to process; the lists are modified in place
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    logger.info("开始标注");
    int sentenceNo = 0;
    for (List<IWord> sentence : sentenceList) {
        sentenceNo += 1;
        logger.info(sentenceNo + " / " + sentenceList.size());
        // Compile each word into its equivalent string form.
        for (IWord current : sentence) {
            Precompiler.compile(current);
        }
        // Add the sentinels only after all words have been compiled.
        LinkedList<IWord> linked = (LinkedList<IWord>) sentence;
        final Word head = new Word(Predefine.TAG_BIGIN, Nature.begin.toString());
        linked.addFirst(head);
        final Word tail = new Word(Predefine.TAG_END, Nature.end.toString());
        linked.addLast(tail);
    }
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) LinkedList(java.util.LinkedList)

Aggregations

Word (com.hankcs.hanlp.corpus.document.sentence.word.Word)12 IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord)9 LinkedList (java.util.LinkedList)9 CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord)5 List (java.util.List)4 Sentence (com.hankcs.hanlp.corpus.document.sentence.Sentence)3 DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker)2 BufferedReader (java.io.BufferedReader)2 FileInputStream (java.io.FileInputStream)2 InputStreamReader (java.io.InputStreamReader)2 CoNLLWord (com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord)1 Item (com.hankcs.hanlp.corpus.dictionary.item.Item)1 CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader)1 Document (com.hankcs.hanlp.corpus.document.Document)1 BufferedWriter (java.io.BufferedWriter)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1