use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.
the class TestCharacterBasedGenerativeModel method testTrainAndSegment.
/**
 * Feeds every simple sentence found under the hard-coded corpus directory to a
 * {@link CharacterBasedGenerativeModel}, trains the model, then tags the characters of a
 * sample string and prints the resulting tag array.
 *
 * @throws Exception propagated from the corpus walk, training or tagging
 */
public void testTrainAndSegment() throws Exception {
    final CharacterBasedGenerativeModel generativeModel = new CharacterBasedGenerativeModel();

    // Collects training material: each sentence of each document is handed to learn().
    CorpusLoader.Handler sentenceFeeder = new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            List<List<Word>> sentences = document.getSimpleSentenceList();
            for (List<Word> words : sentences) {
                generativeModel.learn(words);
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\HanLP\\data\\test\\cbgm", sentenceFeeder);
    generativeModel.train();

    // Disabled save/load round-trip of the trained model, kept for reference:
    // DataOutputStream out = new DataOutputStream(new FileOutputStream(HanLP.Config.HMMSegmentModelPath));
    // model.save(out);
    // out.close();
    // model.load(ByteArray.createByteArray(HanLP.Config.HMMSegmentModelPath));

    char[] tags = generativeModel.tag("中国领土".toCharArray());
    // println(char[]) prints the characters themselves, not the array identity.
    System.out.println(tags);
}
use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.
the class TestCorpusLoader method testMakeOrganizationCustomDictionary.
/**
 * Walks the hard-coded 2014 corpus directory, adds every word whose label starts with
 * {@code "nt"} to a {@link DictionaryMaker}, and saves the result as a text dictionary.
 * (Judging by the method name, {@code "nt"} is presumably the organization-name tag.)
 *
 * @throws Exception propagated from the corpus walk or from saving
 */
public void testMakeOrganizationCustomDictionary() throws Exception {
    final DictionaryMaker organizationDictionary = new DictionaryMaker();
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            for (List<IWord> sentence : document.getComplexSentenceList()) {
                for (IWord candidate : sentence) {
                    // Skip anything whose label does not carry the "nt" prefix.
                    if (!candidate.getLabel().startsWith("nt")) {
                        continue;
                    }
                    organizationDictionary.add(candidate);
                }
            }
        }
    });
    organizationDictionary.saveTxtTo("data/dictionary/custom/机构名词典.txt");
}
use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.
the class TestICWB method testDumpPeople2014ToBEMS.
/**
 * Dumps the hard-coded 2014 corpus to a text file, one sentence per line, with every word
 * value followed by a single space.
 *
 * <p>The writer is closed in a {@code finally} block so the file handle is released, and
 * buffered output flushed, even when the corpus walk fails part-way through.
 *
 * @throws Exception propagated from opening or closing the output file, or from the corpus walk
 */
public void testDumpPeople2014ToBEMS() throws Exception {
    // NOTE(review): OutputStreamWriter is created without a charset, so the platform default
    // encoding is used. Confirm that this is what the downstream consumer of 2014.txt expects.
    final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014.txt")));
    try {
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {
            @Override
            public void handle(Document document) {
                List<List<Word>> simpleSentenceList = document.getSimpleSentenceList();
                for (List<Word> wordList : simpleSentenceList) {
                    try {
                        for (Word word : wordList) {
                            bw.write(word.value);
                            bw.write(' ');
                        }
                        bw.newLine();
                    } catch (IOException e) {
                        // Best effort: a failed sentence is reported and the dump continues
                        // with the next one, because handle() cannot propagate IOException here.
                        e.printStackTrace();
                    }
                }
            }
        });
    } finally {
        // Runs on both success and failure; previously a failing walk skipped close().
        bw.close();
    }
}
use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.
the class TestCorpusLoader method testMakePersonCustomDictionary.
/**
 * Walks the hard-coded 2014 corpus directory, adds every word whose label starts with
 * {@code "nr"} to a {@link DictionaryMaker}, and saves the result as a text dictionary.
 * (Judging by the method name, {@code "nr"} is presumably the person-name tag.)
 *
 * @throws Exception propagated from the corpus walk or from saving
 */
public void testMakePersonCustomDictionary() throws Exception {
    final DictionaryMaker personDictionary = new DictionaryMaker();

    // Filters each document's words down to those labelled "nr..." and records them.
    CorpusLoader.Handler personCollector = new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            List<List<IWord>> sentences = document.getComplexSentenceList();
            for (List<IWord> sentence : sentences) {
                for (IWord token : sentence) {
                    boolean labelledAsPerson = token.getLabel().startsWith("nr");
                    if (labelledAsPerson) {
                        personDictionary.add(token);
                    }
                }
            }
        }
    };

    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", personCollector);
    personDictionary.saveTxtTo("data/dictionary/custom/人名词典.txt");
}
use of com.hankcs.hanlp.corpus.document.Document in project HanLP by hankcs.
the class AdjustCorpus method testPlay.
/**
 * Walks the hard-coded 2014 corpus directory and adds the string form of every
 * {@link CompoundWord} whose label equals {@code "ns"} to a {@link TFDictionary},
 * then saves that dictionary to {@code data/test/complex_ns.txt}.
 *
 * @throws Exception propagated from the corpus walk or from saving
 */
public void testPlay() throws Exception {
    final TFDictionary compoundNsCounts = new TFDictionary();
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            List<List<IWord>> sentences = document.getComplexSentenceList();
            for (List<IWord> sentence : sentences) {
                for (IWord token : sentence) {
                    // The type check comes first so getLabel() is only consulted for compound words.
                    if (!(token instanceof CompoundWord) || !token.getLabel().equals("ns")) {
                        continue;
                    }
                    compoundNsCounts.add(token.toString());
                }
            }
        }
    });
    compoundNsCounts.saveTxtTo("data/test/complex_ns.txt");
}
Aggregations