Use of org.icij.datashare.text.NamedEntitiesBuilder in project datashare by ICIJ.
Class CorenlpPipeline, method processNerClassifier.
/**
 * Named Entity Classifier (Conditional Random Fields) only.
 *
 * <p>Runs the sequence classifier registered for the document's language on a
 * window of the document content and collects every recognized mention.
 *
 * @param doc           the document whose content is analysed
 * @param contentLength maximum number of characters to analyse, counted from
 *                      {@code contentOffset}; the window end is clamped to
 *                      {@code doc.getContentTextLength()}
 * @param contentOffset index in the document content at which the window starts
 * @return the named entities built from the window; each entity offset is
 *         expressed relative to the whole content (chunk offset + {@code contentOffset})
 * @throws InterruptedException declared by this method; presumably raised while
 *                              obtaining the annotator for the language (to be confirmed
 *                              against {@code CoreNlpNerModels})
 */
private List<NamedEntity> processNerClassifier(Document doc, int contentLength, int contentOffset) throws InterruptedException {
    NamedEntitiesBuilder namedEntitiesBuilder = new NamedEntitiesBuilder(getType(), doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    LOGGER.info("name-finding for {} in document {} (offset {})", doc.getLanguage(), doc.getId(), contentOffset);

    // Fetch the classifier wrapper for the document's language.
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> nerAnnotator =
            CoreNlpNerModels.getInstance().get(doc.getLanguage());

    // Restrict the analysis to the requested window, never reading past the content length.
    final int chunkEnd = Math.min(contentOffset + contentLength, doc.getContentTextLength());
    final String chunk = doc.getContent().substring(contentOffset, chunkEnd);

    // Recognize named entities from input
    List<Triple<String, Integer, Integer>> items = nerAnnotator.annotator.classifyToCharacterOffsets(chunk);

    // For each recognized named entity
    for (Triple<String, Integer, Integer> item : items) {
        // Triple: <category, begin, end>, with begin/end used as indices into the chunk
        NamedEntity.Category category = NamedEntity.Category.parse(item.first());
        int begin = item.second();
        int end = item.third();
        String mention = ThrowingFunctions.removeNewLines.apply(chunk.substring(begin, end));
        // Shift the chunk-relative position so it refers to the full document content.
        namedEntitiesBuilder.add(category, mention, begin + contentOffset);
    }
    return namedEntitiesBuilder.build();
}
Use of org.icij.datashare.text.NamedEntitiesBuilder in project datashare by ICIJ.
Class EmailPipeline, method process.
/**
 * Finds e-mail addresses in a window of the document content and, for
 * {@code message/rfc822} documents, in the selected e-mail header metadata.
 *
 * @param doc           the document to scan
 * @param contentLength maximum number of characters to scan, counted from
 *                      {@code contentOffset}; the window end is clamped to
 *                      {@code doc.getContentTextLength()}
 * @param contentOffset index in the document content at which the window starts
 * @return the named entities built from every match; content matches carry an
 *         offset relative to the whole content, header matches carry {@code -1}
 */
@Override
public List<NamedEntity> process(Document doc, int contentLength, int contentOffset) {
    // Cut out the requested window without running past the end of the text.
    final String content = doc.getContent();
    final int windowEnd = Math.min(contentLength + contentOffset, doc.getContentTextLength());
    final Matcher contentMatcher = pattern.matcher(content.substring(contentOffset, windowEnd));

    final NamedEntitiesBuilder entities =
            new NamedEntitiesBuilder(EMAIL, doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());

    // Matches are positioned relative to the window, so shift them by the window start.
    while (matcher(contentMatcher)) {
        entities.add(NamedEntity.Category.EMAIL, contentMatcher.group(0), contentMatcher.start() + contentOffset);
    }

    // Only e-mail documents have headers worth scanning.
    if (!"message/rfc822".equals(doc.getContentType())) {
        return entities.build();
    }

    // Concatenate the configured header values (missing ones count as empty) and scan them too.
    final String headerValues = parsedEmailHeaders.stream()
            .map(key -> doc.getMetadata().getOrDefault(key, "").toString())
            .collect(joining(" "));
    final Matcher headerMatcher = pattern.matcher(headerValues);
    while (matcher(headerMatcher)) {
        // -1: presumably signals that the mention has no position in the content.
        entities.add(NamedEntity.Category.EMAIL, headerMatcher.group(0), -1);
    }
    return entities.build();
}

/** Advances the given matcher to its next match; returns whether one was found. */
private static boolean matcher(Matcher m) {
    return m.find();
}
Aggregations