Use of org.icij.datashare.text.nlp.Pipeline.Type.EMAIL in project datashare by ICIJ.
Class EmailPipeline, method process.
/**
 * Collects every match of {@code pattern} found in a window of the document's content and,
 * for documents whose content type is {@code message/rfc822}, in the metadata values listed
 * in {@code parsedEmailHeaders}.
 *
 * <p>Matches found in the content are added with their position relative to the whole content
 * (match start within the window plus {@code contentOffset}). Matches found in the metadata
 * are added with an offset of {@code -1}.
 *
 * @param doc           the document whose content (and possibly metadata) is scanned
 * @param contentLength maximum number of characters to scan, counted from {@code contentOffset};
 *                      the window is clamped to {@code doc.getContentTextLength()}
 * @param contentOffset index in the content at which the scanned window starts
 * @return the named entities produced by the builder for all matches
 */
@Override
public List<NamedEntity> process(Document doc, int contentLength, int contentOffset) {
    // Compute the window end in long arithmetic: contentLength + contentOffset can exceed
    // Integer.MAX_VALUE and wrap to a negative int, which would make substring() throw.
    // The result is bounded by getContentTextLength(), so the narrowing cast is safe.
    int windowEnd = (int) Math.min((long) contentLength + contentOffset, doc.getContentTextLength());
    Matcher matcher = pattern.matcher(doc.getContent().substring(contentOffset, windowEnd));
    NamedEntitiesBuilder namedEntitiesBuilder =
            new NamedEntitiesBuilder(EMAIL, doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    while (matcher.find()) {
        // matcher.start() is relative to the window; shift it back into content coordinates.
        namedEntitiesBuilder.add(NamedEntity.Category.EMAIL, matcher.group(0), matcher.start() + contentOffset);
    }
    if ("message/rfc822".equals(doc.getContentType())) {
        String metadataString = parsedEmailHeaders.stream()
                .map(key -> {
                    // getOrDefault() only covers absent keys; a key explicitly mapped to null
                    // would still yield null, so handle both cases with one null check.
                    Object value = doc.getMetadata().get(key);
                    return value == null ? "" : value.toString();
                })
                .collect(joining(" "));
        Matcher metaMatcher = pattern.matcher(metadataString);
        while (metaMatcher.find()) {
            namedEntitiesBuilder.add(NamedEntity.Category.EMAIL, metaMatcher.group(0), -1);
        }
    }
    return namedEntitiesBuilder.build();
}
Aggregations