use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class Document method getSimpleSentenceList.
/**
 * Builds a flat, per-sentence list of simple words.
 *
 * A compound word whose label is contained in {@code labelSet} is replaced
 * by its inner words; any other compound word is collapsed into a single
 * word via {@code toWord()}. Every non-compound entry is cast to
 * {@code Word} and copied over unchanged.
 *
 * @param labelSet labels of the compound words that should be split apart
 * @return one list of simple words for each sentence, in sentence order
 */
public List<List<Word>> getSimpleSentenceList(Set<String> labelSet) {
    List<List<Word>> result = new LinkedList<List<Word>>();
    for (Sentence sentence : sentenceList) {
        List<Word> flattened = new LinkedList<Word>();
        for (IWord current : sentence.wordList) {
            // Plain entries are taken as they are.
            if (!(current instanceof CompoundWord)) {
                flattened.add((Word) current);
                continue;
            }
            CompoundWord compound = (CompoundWord) current;
            // Labels that were not requested keep the compound as one word.
            if (!labelSet.contains(compound.getLabel())) {
                flattened.add(compound.toWord());
                continue;
            }
            // Requested label: expand the compound into its parts.
            for (Word part : compound.innerList) {
                flattened.add(part);
            }
        }
        result.add(flattened);
    }
    return result;
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class Sentence method create.
/**
 * Parses a tagged string into a {@code Sentence}.
 *
 * The input is scanned for tokens that are either a bracketed group of
 * {@code text/label} items followed by its own label
 * (for example {@code [a/x b/y]/z}) or a single {@code text/label} item.
 * Each matched token is handed to {@code WordFactory.create}.
 *
 * @param param the tagged text to parse
 * @return the sentence built from all matched tokens, or {@code null} when
 *         {@code WordFactory.create} returns {@code null} for any token
 *         (a warning is logged in that case)
 */
public static Sentence create(String param) {
    List<IWord> parsedWords = new LinkedList<IWord>();
    Matcher tokens = Pattern
            .compile("(\\[(([^\\s]+/[0-9a-zA-Z]+)\\s+)+?([^\\s]+/[0-9a-zA-Z]+)]/[0-9a-zA-Z]+)|([^\\s]+/[0-9a-zA-Z]+)")
            .matcher(param);
    while (tokens.find()) {
        String token = tokens.group();
        IWord parsed = WordFactory.create(token);
        if (parsed != null) {
            parsedWords.add(parsed);
            continue;
        }
        // A single unparsable token invalidates the whole sentence.
        logger.warning("在用" + token + "构造单词时失败");
        return null;
    }
    return new Sentence(parsedWords);
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class Sentence method toString.
/**
 * Renders the sentence as the string forms of its words separated by
 * single spaces, with no leading or trailing space.
 *
 * @return the space-joined words; an empty string when there are no words
 */
@Override
public String toString() {
    StringBuilder joined = new StringBuilder();
    boolean needsSeparator = false;
    for (IWord word : wordList) {
        // The separator goes before every word except the first one.
        if (needsSeparator) {
            joined.append(' ');
        }
        joined.append(word);
        needsSeparator = true;
    }
    return joined.toString();
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class TestCorpusLoader method testMakeOrganizationCustomDictionary.
/**
 * Walks the 2014 corpus folder, collects every word whose label starts
 * with "nt" into a dictionary maker, and saves the result as a text file.
 *
 * @throws Exception declared by the method; whatever the calls below raise propagates
 */
public void testMakeOrganizationCustomDictionary() throws Exception {
    final DictionaryMaker organizationMaker = new DictionaryMaker();
    CorpusLoader.Handler collector = new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            for (List<IWord> sentenceWords : document.getComplexSentenceList()) {
                for (IWord candidate : sentenceWords) {
                    // Only labels with the "nt" prefix are kept.
                    if (!candidate.getLabel().startsWith("nt")) {
                        continue;
                    }
                    organizationMaker.add(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", collector);
    organizationMaker.saveTxtTo("data/dictionary/custom/机构名词典.txt");
}
use of com.hankcs.hanlp.corpus.document.sentence.word.IWord in project HanLP by hankcs.
the class TestCorpusLoader method testMakePersonCustomDictionary.
/**
 * Walks the 2014 corpus folder, collects every word whose label starts
 * with "nr" into a dictionary maker, and saves the result as a text file.
 *
 * @throws Exception declared by the method; whatever the calls below raise propagates
 */
public void testMakePersonCustomDictionary() throws Exception {
    final DictionaryMaker personMaker = new DictionaryMaker();
    CorpusLoader.Handler collector = new CorpusLoader.Handler() {
        @Override
        public void handle(Document document) {
            for (List<IWord> sentenceWords : document.getComplexSentenceList()) {
                for (IWord candidate : sentenceWords) {
                    // Only labels with the "nr" prefix are kept.
                    if (!candidate.getLabel().startsWith("nr")) {
                        continue;
                    }
                    personMaker.add(candidate);
                }
            }
        }
    };
    CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", collector);
    personMaker.saveTxtTo("data/dictionary/custom/人名词典.txt");
}
Aggregations