Search in sources:

Example 6 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestMakeCompanyCorpus method testParse.

/**
 * Feeds every document found under {@code data/test/nt/part/} into an
 * {@link NTDictionaryMaker} and writes the result out as text.
 *
 * <p>The maker is constructed from an {@link EasyDictionary} created from
 * {@code data/dictionary/2014_dictionary.txt}. The output location passed to
 * {@code saveTxtTo} is a hard-coded absolute path.
 *
 * @throws Exception if any of the called corpus/dictionary operations throws
 */
public void testParse() throws Exception {
    final NTDictionaryMaker ntMaker =
            new NTDictionaryMaker(EasyDictionary.create("data/dictionary/2014_dictionary.txt"));

    // For each loaded document, pass its complex sentence list to the maker.
    CorpusLoader.Handler feedMaker = new CorpusLoader.Handler() {

        @Override
        public void handle(Document doc) {
            ntMaker.compute(doc.getComplexSentenceList());
        }
    };

    // NOTE(review): an earlier revision walked a full 2014 corpus directory instead
    // of this partial test directory — confirm which input is intended.
    CorpusLoader.walk("data/test/nt/part/", feedMaker);

    ntMaker.saveTxtTo("D:\\JavaProjects\\HanLP\\data\\dictionary\\organization\\outerNT");
}
Also used : EasyDictionary(com.hankcs.hanlp.corpus.dictionary.EasyDictionary) CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) NTDictionaryMaker(com.hankcs.hanlp.corpus.dictionary.NTDictionaryMaker) Document(com.hankcs.hanlp.corpus.document.Document)

Example 7 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestDictionaryMaker method testMakeDictionary.

/**
 * Walks the corpus under {@code data/2014}, passes each document together with a
 * shared {@link DictionaryMaker} to {@code addToDictionary}, and finally saves the
 * maker's contents to {@code data/2014_dictionary.txt}.
 *
 * @throws Exception if any of the called corpus/dictionary operations throws
 */
public void testMakeDictionary() throws Exception {
    final DictionaryMaker maker = new DictionaryMaker();

    // Handler invoked by CorpusLoader.walk for each document it loads.
    CorpusLoader.Handler collector = new CorpusLoader.Handler() {

        @Override
        public void handle(Document doc) {
            addToDictionary(doc, maker);
        }
    };

    CorpusLoader.walk("data/2014", collector);
    maker.saveTxtTo("data/2014_dictionary.txt");
}
Also used : CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) Document(com.hankcs.hanlp.corpus.document.Document)

Example 8 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestDictionaryMaker method testSingleDocument.

/**
 * Converts a single corpus file into a {@link Document}, prints it to standard
 * output, passes it to {@code addToDictionary} with a fresh {@link DictionaryMaker},
 * and saves the maker's contents to {@code data/dictionaryTest.txt}.
 *
 * @throws Exception if any of the called corpus/dictionary operations throws
 */
public void testSingleDocument() throws Exception {
    File corpusFile = new File("data/2014/0101/c1002-23996898.txt");
    Document doc = CorpusLoader.convert2Document(corpusFile);
    DictionaryMaker maker = new DictionaryMaker();

    // Dump the converted document for manual inspection before processing it.
    System.out.println(doc);

    addToDictionary(doc, maker);
    maker.saveTxtTo("data/dictionaryTest.txt");
}
Also used : DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) Document(com.hankcs.hanlp.corpus.document.Document) File(java.io.File)

Example 9 with Document

use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.

the class TestAdjustCoreDictionary method testSimplifyNZ.

/**
 * Walks a corpus directory and adds every word that is a {@link CompoundWord}
 * with the label {@code "nz"} to a {@link DictionaryMaker}, then saves the
 * collected entries to {@code data/test/nz.txt}.
 *
 * <p>The corpus directory is a hard-coded absolute path.
 *
 * @throws Exception if any of the called corpus/dictionary operations throws
 */
public void testSimplifyNZ() throws Exception {
    final DictionaryMaker nzMaker = new DictionaryMaker();

    CorpusLoader.Handler nzCollector = new CorpusLoader.Handler() {

        @Override
        public void handle(Document doc) {
            for (List<IWord> words : doc.getComplexSentenceList()) {
                for (IWord candidate : words) {
                    // Only compound words are of interest; check the type first so
                    // the label is never queried for other kinds of word.
                    if (!(candidate instanceof CompoundWord)) {
                        continue;
                    }
                    if (!"nz".equals(candidate.getLabel())) {
                        continue;
                    }
                    nzMaker.add(candidate);
                }
            }
        }
    };

    CorpusLoader.walk("D:\\Doc\\语料库\\2014", nzCollector);
    nzMaker.saveTxtTo("data/test/nz.txt");
}
Also used : CorpusLoader(com.hankcs.hanlp.corpus.document.CorpusLoader) DictionaryMaker(com.hankcs.hanlp.corpus.dictionary.DictionaryMaker) List(java.util.List) Document(com.hankcs.hanlp.corpus.document.Document) CompoundWord(com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord) IWord(com.hankcs.hanlp.corpus.document.sentence.word.IWord)

Aggregations

Document (com.hankcs.hanlp.corpus.document.Document): 9; CorpusLoader (com.hankcs.hanlp.corpus.document.CorpusLoader): 8; List (java.util.List): 6; DictionaryMaker (com.hankcs.hanlp.corpus.dictionary.DictionaryMaker): 5; IWord (com.hankcs.hanlp.corpus.document.sentence.word.IWord): 5; CompoundWord (com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord): 2; LinkedList (java.util.LinkedList): 2; EasyDictionary (com.hankcs.hanlp.corpus.dictionary.EasyDictionary): 1; NTDictionaryMaker (com.hankcs.hanlp.corpus.dictionary.NTDictionaryMaker): 1; TFDictionary (com.hankcs.hanlp.corpus.dictionary.TFDictionary): 1; Word (com.hankcs.hanlp.corpus.document.sentence.word.Word): 1; CharacterBasedGenerativeModel (com.hankcs.hanlp.model.trigram.CharacterBasedGenerativeModel): 1; BufferedWriter (java.io.BufferedWriter): 1; File (java.io.File): 1; FileOutputStream (java.io.FileOutputStream): 1; IOException (java.io.IOException): 1; OutputStreamWriter (java.io.OutputStreamWriter): 1