Use of edu.stanford.nlp.ie.AbstractSequenceClassifier in the project datashare by ICIJ.
Class CorenlpPipeline, method processNerClassifier.
/**
 * Runs the Named Entity Classifier (Conditional Random Fields) only, on one
 * chunk of the document content, and collects the recognized mentions.
 *
 * <p>The chunk starts at {@code contentOffset} and spans at most
 * {@code contentLength} characters; its end is capped at
 * {@code doc.getContentTextLength()}.
 *
 * @param doc           the document whose content is analyzed
 * @param contentLength maximum number of characters of content to process
 * @param contentOffset index in the document content where the chunk starts
 * @return the named entities built from the classifier output; each entity offset
 *         is the mention's offset in the chunk plus {@code contentOffset}
 * @throws InterruptedException declared by this method; presumably propagated from
 *         the NER model lookup (not visible in this block - to be confirmed)
 */
private List<NamedEntity> processNerClassifier(Document doc, int contentLength, int contentOffset) throws InterruptedException {
    NamedEntitiesBuilder namedEntitiesBuilder = new NamedEntitiesBuilder(getType(), doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    LOGGER.info("name-finding for {} in document {} (offset {})", doc.getLanguage(), doc.getId(), contentOffset);

    // Fetch the classifier registered for the document language.
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> nerAnnotator = CoreNlpNerModels.getInstance().get(doc.getLanguage());

    // Only the requested window of the content is classified.
    int chunkEnd = Math.min(contentOffset + contentLength, doc.getContentTextLength());
    String chunk = doc.getContent().substring(contentOffset, chunkEnd);

    // Each triple is <category, begin, end>, with character offsets relative to the chunk.
    List<Triple<String, Integer, Integer>> items = nerAnnotator.annotator.classifyToCharacterOffsets(chunk);
    for (Triple<String, Integer, Integer> item : items) {
        NamedEntity.Category category = NamedEntity.Category.parse(item.first());
        int begin = item.second();
        int end = item.third();
        String mention = ThrowingFunctions.removeNewLines.apply(chunk.substring(begin, end));
        // Shift the chunk-relative offset by the chunk start before recording it.
        namedEntitiesBuilder.add(category, mention, begin + contentOffset);
    }
    return namedEntitiesBuilder.build();
}
Aggregations