Use of edu.stanford.nlp.process.DocumentPreprocessor in project textdb by TextDB.
Example from the class NlpSplitOperator, method computeSentenceList:
private List<Span> computeSentenceList(Tuple inputTuple) {
    String inputText = inputTuple.<IField>getField(predicate.getInputAttributeName()).getValue().toString();
    Reader reader = new StringReader(inputText);

    // Split the input text into sentences using Stanford CoreNLP's DocumentPreprocessor.
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(reader);
    documentPreprocessor.setTokenizerFactory(
            PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false"));

    List<Span> sentenceList = new ArrayList<Span>();
    int start = 0;
    int end = 0;
    String key = PropertyNameConstants.NLP_SPLIT_KEY;
    String attributeName = predicate.getInputAttributeName();

    // Each iteration yields one tokenized sentence; rebuild its text and record
    // its character span, advancing the start offset past the previous sentence.
    for (List<HasWord> sentence : documentPreprocessor) {
        String sentenceText = SentenceUtils.listToString(sentence);
        end = start + sentenceText.length();
        Span span = new Span(attributeName, start, end, key, sentenceText);
        sentenceList.add(span);
        start = end + 1;
    }
    return sentenceList;
}
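For reference, below is a minimal, self-contained sketch of the same DocumentPreprocessor usage outside of textdb, assuming only that the Stanford CoreNLP library is on the classpath; the class name SentenceSplitDemo and the sample text are illustrative, not part of the project.

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;

public class SentenceSplitDemo {
    public static void main(String[] args) {
        String text = "Stanford CoreNLP splits text into sentences. Each sentence becomes a list of tokens.";
        Reader reader = new StringReader(text);

        // Same setup as computeSentenceList: a DocumentPreprocessor over a Reader,
        // configured with a PTB tokenizer that disables PTB3 escaping.
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(reader);
        documentPreprocessor.setTokenizerFactory(
                PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory("ptb3Escaping=false"));

        // Iterating over the preprocessor yields one tokenized sentence per loop iteration.
        for (List<HasWord> sentence : documentPreprocessor) {
            System.out.println(SentenceUtils.listToString(sentence));
        }
    }
}

The sketch prints each detected sentence on its own line; computeSentenceList instead wraps each sentence string in a Span with its character offsets so downstream textdb operators can locate it in the original field.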