Use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
From the class Document, method getSimpleSentenceList.
/**
 * Gets a simple sentence list in which every compound word is split
 * into its simple component words.
 *
 * @return one {@code List<Word>} per sentence, with compound words flattened
 *         into their inner simple words
 */
public List<List<Word>> getSimpleSentenceList() {
    List<List<Word>> simpleList = new LinkedList<List<Word>>();
    for (Sentence sentence : sentenceList) {
        List<Word> wordList = new LinkedList<Word>();
        for (IWord word : sentence.wordList) {
            if (word instanceof CompoundWord) {
                // Flatten the compound word into its inner simple words.
                wordList.addAll(((CompoundWord) word).innerList);
            } else {
                wordList.add((Word) word);
            }
        }
        simpleList.add(wordList);
    }
    return simpleList;
}
Use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
From the class Document, overload of method getSimpleSentenceList taking a boolean flag.
/**
 * Gets a simple sentence list.
 *
 * @param spilt whether compound words should be split into their inner
 *              simple words; when {@code false} each compound word is
 *              converted to a single {@link Word} via {@code toWord()}.
 *              (The parameter name is a historical typo of "split",
 *              kept for source compatibility.)
 * @return one {@code List<Word>} per sentence
 */
public List<List<Word>> getSimpleSentenceList(boolean spilt) {
    List<List<Word>> simpleList = new LinkedList<List<Word>>();
    for (Sentence sentence : sentenceList) {
        List<Word> wordList = new LinkedList<Word>();
        for (IWord word : sentence.wordList) {
            if (word instanceof CompoundWord) {
                if (spilt) {
                    // Flatten the compound word into its inner simple words.
                    wordList.addAll(((CompoundWord) word).innerList);
                } else {
                    wordList.add(((CompoundWord) word).toWord());
                }
            } else {
                wordList.add((Word) word);
            }
        }
        simpleList.add(wordList);
    }
    return simpleList;
}
Use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
From the class Document, overload of method getSimpleSentenceList taking a label set.
/**
 * Gets a simple sentence list in which a compound word is split into its
 * inner simple words only when its label is contained in the given set.
 *
 * @param labelSet labels of compound words that should be split; compound
 *                 words with other labels are kept as a single {@link Word}
 *                 via {@code toWord()}
 * @return one {@code List<Word>} per sentence
 */
public List<List<Word>> getSimpleSentenceList(Set<String> labelSet) {
    List<List<Word>> simpleList = new LinkedList<List<Word>>();
    for (Sentence sentence : sentenceList) {
        List<Word> wordList = new LinkedList<Word>();
        for (IWord word : sentence.wordList) {
            if (word instanceof CompoundWord) {
                if (labelSet.contains(word.getLabel())) {
                    // This label is marked for splitting: flatten the
                    // compound word into its inner simple words.
                    wordList.addAll(((CompoundWord) word).innerList);
                } else {
                    wordList.add(((CompoundWord) word).toWord());
                }
            } else {
                wordList.add((Word) word);
            }
        }
        simpleList.add(wordList);
    }
    return simpleList;
}
Use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
From the class TestICWB, method testDumpPeople2014ToBEMS.
/**
 * Dumps the People's Daily 2014 corpus to a space-separated plain-text
 * file suitable for CRF++ segmentation training.
 *
 * <p>The writer is managed with try-with-resources so it is closed (and
 * therefore flushed) even if {@code CorpusLoader.walk} throws — the
 * previous manual {@code bw.close()} was skipped on any exception,
 * leaking the handle and potentially losing buffered output.
 *
 * <p>NOTE(review): no explicit charset is given, so the platform default
 * encoding is used — confirm this matches the downstream CRF++ setup.
 *
 * @throws Exception if the output file cannot be opened or the walk fails
 */
public void testDumpPeople2014ToBEMS() throws Exception {
    try (final BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\Tools\\CRF++-0.58\\example\\seg_cn\\2014.txt")))) {
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler() {
            @Override
            public void handle(Document document) {
                List<List<Word>> simpleSentenceList = document.getSimpleSentenceList();
                for (List<Word> wordList : simpleSentenceList) {
                    try {
                        // One sentence per line, words separated by spaces.
                        for (Word word : wordList) {
                            bw.write(word.value);
                            bw.write(' ');
                        }
                        bw.newLine();
                    } catch (IOException e) {
                        // Best-effort dump: log and keep processing the
                        // remaining sentences rather than aborting the walk.
                        e.printStackTrace();
                    }
                }
            }
        });
    }
}
Use of com.hankcs.hanlp.corpus.document.sentence.word.Word in project HanLP by hankcs.
From the class NSDictionaryMaker, method roleTag.
/**
 * Tags every word in the corpus with an NS (place-name) role label, in five
 * order-dependent passes over each sentence:
 * <ol>
 *   <li>tag the word immediately BEFORE each ns-run with {@code NS.A} (preceding context),</li>
 *   <li>tag the word immediately AFTER each ns-run with {@code NS.B} (following context),</li>
 *   <li>tag a single non-ns word sandwiched between two ns words with {@code NS.X},</li>
 *   <li>split compound place names via {@code CorpusUtil.spilt},</li>
 *   <li>rewrite each remaining ns word in place: either as a whole ({@code NS.G}),
 *       or as a head part ({@code NS.G}/{@code NS.C}..) plus a suffix part ({@code NS.H}),
 *       based on the longest known place-name suffix; all non-ns words become {@code NS.Z}.</li>
 * </ol>
 * The list is mutated in place; sentinel words are prepended/appended first so the
 * context passes never run off either end.
 */
@Override
protected void roleTag(List<List<IWord>> sentenceList) {
    int i = 0;
    for (List<IWord> wordList : sentenceList) {
        // Strip/normalize every non-NS annotation so only ns labels remain meaningful.
        Precompiler.compileWithoutNS(wordList);
        if (verbose) {
            System.out.print(++i + " / " + sentenceList.size() + " ");
            System.out.println("原始语料 " + wordList);
        }
        // NOTE(review): assumes the corpus loader always produces LinkedList
        // sentences — this cast fails for any other List implementation.
        LinkedList<IWord> wordLinkedList = (LinkedList<IWord>) wordList;
        // Add begin/end sentinels ("S"/"Z") so the context-tagging passes below
        // always have a neighbor to label on both sides of an ns-run.
        wordLinkedList.addFirst(new Word(Predefine.TAG_BIGIN, "S"));
        wordLinkedList.addLast(new Word(Predefine.TAG_END, "Z"));
        if (verbose)
            System.out.println("添加首尾 " + wordList);
        // Pass 1 — preceding context: forward scan; a non-ns word directly
        // before an ns word becomes NS.A.
        Iterator<IWord> iterator = wordLinkedList.iterator();
        IWord pre = iterator.next();
        while (iterator.hasNext()) {
            IWord current = iterator.next();
            if (current.getLabel().startsWith("ns") && !pre.getLabel().startsWith("ns")) {
                pre.setLabel(NS.A.toString());
            }
            pre = current;
        }
        if (verbose)
            System.out.println("标注上文 " + wordList);
        // Pass 2 — following context: same scan run backwards; a non-ns word
        // directly after an ns word becomes NS.B.
        iterator = wordLinkedList.descendingIterator();
        pre = iterator.next();
        while (iterator.hasNext()) {
            IWord current = iterator.next();
            if (current.getLabel().startsWith("ns") && !pre.getLabel().startsWith("ns")) {
                pre.setLabel(NS.B.toString());
            }
            pre = current;
        }
        if (verbose)
            System.out.println("标注下文 " + wordList);
        // Pass 3 — connector: a sliding window of three; a single non-ns word
        // flanked by ns words on both sides becomes NS.X.
        iterator = wordLinkedList.iterator();
        IWord first = iterator.next();
        IWord second = iterator.next();
        while (iterator.hasNext()) {
            IWord third = iterator.next();
            if (first.getLabel().startsWith("ns") && third.getLabel().startsWith("ns") && !second.getLabel().startsWith("ns")) {
                second.setLabel(NS.X.toString());
            }
            first = second;
            second = third;
        }
        if (verbose)
            System.out.println("标注中间 " + wordList);
        // Pass 4 — split compound place names into their component words.
        CorpusUtil.spilt(wordList);
        if (verbose)
            System.out.println("拆分地名 " + wordList);
        // Pass 5 — rewrite each remaining word in place.
        ListIterator<IWord> listIterator = wordLinkedList.listIterator();
        while (listIterator.hasNext()) {
            IWord word = listIterator.next();
            String label = word.getLabel();
            // Already role-tagged by an earlier pass (NS roles are uppercase);
            // skip. Raw corpus POS labels like "ns" are lowercase.
            if (label.equals(label.toUpperCase()))
                continue;
            if (label.startsWith("ns")) {
                String value = word.getValue();
                // Split the place name into (head, known-suffix) where the
                // suffix comes from the longest match in the suffix dictionary.
                int longestSuffixLength = PlaceSuffixDictionary.dictionary.getLongestSuffixLength(value);
                int wordLength = value.length() - longestSuffixLength;
                if (longestSuffixLength == 0 || wordLength == 0) {
                    // No suffix, or the word IS a suffix: tag whole word as NS.G.
                    word.setLabel(NS.G.toString());
                    continue;
                }
                // Replace the original word with its decomposed parts.
                listIterator.remove();
                if (wordLength > 3) {
                    // Long head: keep it as one NS.G token plus the NS.H suffix.
                    listIterator.add(new Word(value.substring(0, wordLength), NS.G.toString()));
                    listIterator.add(new Word(value.substring(wordLength), NS.H.toString()));
                    continue;
                }
                // Short head (1-3 chars): emit one single-char token per
                // position, labeled NS.C, NS.D, ... in enum order.
                for (int l = 1, tag = NS.C.ordinal(); l <= wordLength; ++l, ++tag) {
                    listIterator.add(new Word(value.substring(l - 1, l), NS.values()[tag].toString()));
                }
                listIterator.add(new Word(value.substring(wordLength), NS.H.toString()));
            } else {
                // Everything else is an "outside" word.
                word.setLabel(NS.Z.toString());
            }
        }
        if (verbose)
            System.out.println("处理整个 " + wordList);
    }
}
Aggregations