Search in sources:

Example 1 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestCharacterBasedGenerativeModel method testTrainAndSegment.

/**
 * Feeds every simple sentence found under the cbgm test corpus directory to a
 * {@code CharacterBasedGenerativeModel}, trains it, and prints the tags the model
 * assigns to the characters of a short sample text.
 *
 * @throws Exception propagated from the calls made in this method
 */
public void testTrainAndSegment() throws Exception {
    final CharacterBasedGenerativeModel generativeModel = new CharacterBasedGenerativeModel();
    // Handler that passes each sentence of a visited document to the model.
    final CorpusLoader.Handler sentenceLearner = new CorpusLoader.Handler() {

        @Override
        public void handle(Document document) {
            List<List<Word>> sentences = document.getSimpleSentenceList();
            for (List<Word> words : sentences) {
                generativeModel.learn(words);
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\HanLP\\data\\test\\cbgm", sentenceLearner);
    generativeModel.train();
    // Saving the trained model to HanLP.Config.HMMSegmentModelPath and loading it
    // back from there was left disabled in this test.
    char[] sampleChars = "中国领土".toCharArray();
    char[] predictedTags = generativeModel.tag(sampleChars);
    // println(char[]) prints the characters themselves, not the array reference.
    System.out.println(predictedTags);
}
Also used : CharacterBasedGenerativeModel(com.hankcs.hanlp.model.trigram.CharacterBasedGenerativeModel) CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) List(java.util.List) LinkedList(java.util.LinkedList) Document(com.hankcs.hanlp.corpus.document.Document)

Example 2 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestCorpusLoader method testMakeOrganizationCustomDictionary.

/**
 * Walks the 2014 corpus directory, adds every word whose label starts with
 * {@code "nt"} to a {@code DictionaryMaker}, and saves the result as a text file.
 *
 * @throws Exception propagated from the calls made in this method
 */
public void testMakeOrganizationCustomDictionary() throws Exception {
    final DictionaryMaker maker = new DictionaryMaker();
    // Handler that picks the "nt"-labelled words out of each visited document.
    final CorpusLoader.Handler collector = new CorpusLoader.Handler() {

        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord candidate : sentence) {
                    if (!candidate.getLabel().startsWith("nt")) {
                        continue;
                    }
                    maker.add(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", collector);
    maker.saveTxtTo("data/dictionary/custom/机构名词典.txt");
}
Also used : CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) List(java.util.List) Document(com.hankcs.hanlp.corpus.document.Document) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Example 3 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestICWB method testDumpPeople2014ToBEMS.

/**
 * Walks the 2014 corpus directory and writes every simple sentence to a text file,
 * one sentence per line, each word's {@code value} followed by a single space.
 *
 * <p>The writer is closed in a {@code finally} block so the file handle is released
 * and buffered output is flushed even when the corpus walk terminates with an exception.
 *
 * @throws Exception if the output file cannot be opened or closed, or if an exception
 *                   escapes the corpus walk
 */
public void testDumpPeople2014ToBEMS() throws Exception {
    // NOTE(review): no charset is given, so OutputStreamWriter falls back to the
    // platform default encoding; confirm that is what the consumer of this file expects.
    final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014.txt")));
    try {
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {

            @Override
            public void handle(Document document) {
                List<List<Word>> simpleSentenceList = document.getSimpleSentenceList();
                for (List<Word> wordList : simpleSentenceList) {
                    try {
                        for (Word word : wordList) {
                            bw.write(word.value);
                            bw.write(' ');
                        }
                        bw.newLine();
                    } catch (IOException e) {
                        // handle() as overridden here declares no checked exceptions, so a
                        // failed write is reported and the dump continues with the next sentence.
                        e.printStackTrace();
                    }
                }
            }
        });
    } finally {
        // Always release the file, including when walk() throws.
        bw.close();
    }
}
Also used : Word(com.hankcs.hanlp.corpus.document.sentence.word.Word) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord) FileOutputStream(java.io.FileOutputStream) CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) OutputStreamWriter(java.io.OutputStreamWriter) List(java.util.List) LinkedList(java.util.LinkedList) IOException(java.io.IOException) Document(com.hankcs.hanlp.corpus.document.Document) BufferedWriter(java.io.BufferedWriter)

Example 4 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestCorpusLoader method testMakePersonCustomDictionary.

/**
 * Walks the 2014 corpus directory, adds every word whose label starts with
 * {@code "nr"} to a {@code DictionaryMaker}, and saves the result as a text file.
 *
 * @throws Exception propagated from the calls made in this method
 */
public void testMakePersonCustomDictionary() throws Exception {
    final DictionaryMaker personDictionary = new DictionaryMaker();
    // Handler that keeps only the "nr"-labelled words of each visited document.
    final CorpusLoader.Handler personCollector = new CorpusLoader.Handler() {

        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord token : sentence) {
                    boolean labelledAsPerson = token.getLabel().startsWith("nr");
                    if (labelledAsPerson) {
                        personDictionary.add(token);
                    }
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", personCollector);
    personDictionary.saveTxtTo("data/dictionary/custom/人名词典.txt");
}
Also used : CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) List(java.util.List) Document(com.hankcs.hanlp.corpus.document.Document) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Example 5 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class AdjustCorpus method testPlay.

/**
 * Walks the 2014 corpus directory, adds the string form of every {@code CompoundWord}
 * whose label equals {@code "ns"} to a {@code TFDictionary}, and saves the result
 * as a text file.
 *
 * @throws Exception propagated from the calls made in this method
 */
public void testPlay() throws Exception {
    final TFDictionary compoundPlaces = new TFDictionary();
    // Handler that records the matching compound words of each visited document.
    final CorpusLoader.Handler placeCollector = new CorpusLoader.Handler() {

        @Override
        public void handle(Document document) {
            List<List<IWord>> sentences = document.getComplexSentenceList();
            for (List<IWord> sentence : sentences) {
                for (IWord token : sentence) {
                    // Type check first, label check second: same order as a short-circuit &&.
                    if (!(token instanceof CompoundWord)) {
                        continue;
                    }
                    if (!token.getLabel().equals("ns")) {
                        continue;
                    }
                    compoundPlaces.add(token.toString());
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", placeCollector);
    compoundPlaces.saveTxtTo("data/test/complex_ns.txt");
}
Also used : TFDictionary(com.hankcs.hanlp.corpus.dictionary.TFDictionary) CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) List(java.util.List) Document(com.hankcs.hanlp.corpus.document.Document) CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Aggregations

Document (com.hankcs.hanlp.corpus.document.Document): 9, CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader): 8, List (java.util.List): 6, DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker): 5, IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord): 5, CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord): 2, LinkedList (java.util.LinkedList): 2, EasyDictionary (com.hankcs.hanlp.corpus.dictionary.EasyDictionary): 1, NTDictionaryMaker (com.hankcs.hanlp.corpus.dictionary.NTDictionaryMaker): 1, TFDictionary (com.hankcs.hanlp.corpus.dictionary.TFDictionary): 1, Word (com.hankcs.hanlp.corpus.document.sentence.word.Word): 1, CharacterBasedGenerativeModel (com.hankcs.hanlp.model.trigram.CharacterBasedGenerativeModel): 1, BufferedWriter (java.io.BufferedWriter): 1, File (java.io.File): 1, FileOutputStream (java.io.FileOutputStream): 1, IOException (java.io.IOException): 1, OutputStreamWriter (java.io.OutputStreamWriter): 1