use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class EmailPipeline method process.
/**
 * Finds email addresses in one slice of the document content and, when the
 * document content type is {@code message/rfc822}, in the metadata values
 * stored under the keys listed in {@code parsedEmailHeaders}.
 *
 * @param doc           document whose content and metadata are scanned
 * @param contentLength maximum number of content characters to scan
 * @param contentOffset index in the content where the scanned slice begins
 * @return the named entities produced by the builder for every match found
 */
@Override
public List<NamedEntity> process(Document doc, int contentLength, int contentOffset) {
    String content = doc.getContent();
    // The slice end is capped at the value reported by getContentTextLength().
    int sliceEnd = Math.min(contentLength + contentOffset, doc.getContentTextLength());
    Matcher contentMatcher = pattern.matcher(content.substring(contentOffset, sliceEnd));
    NamedEntitiesBuilder builder = new NamedEntitiesBuilder(EMAIL, doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());

    while (contentMatcher.find()) {
        // Match positions are relative to the slice, so shift them by the slice offset.
        builder.add(NamedEntity.Category.EMAIL, contentMatcher.group(0), contentMatcher.start() + contentOffset);
    }

    if (!"message/rfc822".equals(doc.getContentType())) {
        return builder.build();
    }

    // Concatenate the selected metadata values (missing keys count as "") and scan them too.
    String headerValues = parsedEmailHeaders.stream()
            .map(header -> doc.getMetadata().getOrDefault(header, "").toString())
            .collect(joining(" "));
    Matcher headerMatcher = pattern.matcher(headerValues);
    while (headerMatcher.find()) {
        // Matches found in metadata are registered with offset -1
        // (presumably "no position in the content" - confirm with NamedEntitiesBuilder).
        builder.add(NamedEntity.Category.EMAIL, headerMatcher.group(0), -1);
    }
    return builder.build();
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class NlpConsumer method findNamedEntities.
/**
 * Fetches a document from the index, runs the NLP pipeline on its content and
 * stores the resulting named entities through the indexer.
 *
 * Content shorter than {@code maxContentLengthChars} is processed in a single
 * call; longer content is processed chunk by chunk. An {@link IOException} is
 * logged and not rethrown.
 *
 * @param projectName project (index) name passed to the indexer
 * @param id          id of the document to load
 * @param routing     routing value passed to the indexer lookup
 * @throws InterruptedException declared by this method; presumably raised by the pipeline calls
 */
void findNamedEntities(final String projectName, final String id, final String routing) throws InterruptedException {
    try {
        final Document doc = indexer.get(projectName, id, routing);
        if (doc == null) {
            logger.warn("no document found in index with id " + id);
            return;
        }

        logger.info("extracting {} entities for document {}", nlpPipeline.getType(), doc.getId());
        if (!nlpPipeline.initialize(doc.getLanguage())) {
            return;
        }

        final int contentSize = doc.getContent().length();
        int entityCount = 0;
        if (contentSize >= this.maxContentLengthChars) {
            final int chunkCount = contentSize / this.maxContentLengthChars + 1;
            logger.info("document is too large, extracting entities for {} document chunks", chunkCount);
            for (int chunk = 0; chunk < chunkCount; chunk++) {
                List<NamedEntity> chunkEntities = nlpPipeline.process(doc, maxContentLengthChars, chunk * maxContentLengthChars);
                boolean lastChunk = chunk == chunkCount - 1;
                if (lastChunk) {
                    // Only the final chunk uses the overload that also receives the pipeline type and the document.
                    indexer.bulkAdd(projectName, nlpPipeline.getType(), chunkEntities, doc);
                } else {
                    indexer.bulkAdd(projectName, chunkEntities);
                }
                entityCount += chunkEntities.size();
            }
        } else {
            List<NamedEntity> entities = nlpPipeline.process(doc);
            indexer.bulkAdd(projectName, nlpPipeline.getType(), entities, doc);
            entityCount = entities.size();
        }

        logger.info("added {} named entities to document {}", entityCount, doc.getId());
        nlpPipeline.terminate(doc.getLanguage());
    } catch (IOException e) {
        logger.error("cannot extract entities of doc " + id, e);
    }
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class BenchDocument method testReadsAndWrites.
/**
 * Benchmark-style test: writes {@code nbDocs} documents, each with
 * {@code nbNes} named entities, through the repository, then reads every
 * named entity back by id. Elapsed times are logged; nothing is asserted.
 */
@Test
public void testReadsAndWrites() {
    int nbDocs = 100;
    int nbNes = 100;
    // Ids are only appended and then iterated, so an array-backed list is enough.
    List<String> neIds = new ArrayList<>(nbDocs * nbNes);
    logger.info("writing {} documents with {} named entities", nbDocs, nbNes);
    // nanoTime is monotonic, unlike currentTimeMillis which follows the wall clock.
    long beginNanos = System.nanoTime();
    for (int docIdx = 0; docIdx < nbDocs; docIdx++) {
        // Plain map instead of double-brace initialisation: no anonymous
        // HashMap subclass capturing the test instance.
        HashMap<String, Object> metadata = new HashMap<>();
        for (int keyIdx = 1; keyIdx <= 10; keyIdx++) {
            metadata.put("key" + keyIdx, "value" + keyIdx);
        }
        Document document = new Document(project("prj"), Paths.get("/foo/bar_" + docIdx + ".txt"), "This is a content with Gael Giraud " + docIdx, Language.FRENCH, Charset.defaultCharset(), "text/plain", metadata, Document.Status.INDEXED, 345L);
        repository.create(document);

        List<NamedEntity> neList = new ArrayList<>(nbNes);
        for (int neIdx = 0; neIdx < nbNes; neIdx++) {
            NamedEntity ne = NamedEntity.create(NamedEntity.Category.PERSON, "Gael Giraud" + neIdx, Arrays.asList(23L), document.getId(), "root", Pipeline.Type.CORENLP, Language.FRENCH);
            neIds.add(ne.getId());
            neList.add(ne);
        }
        repository.create(neList);

        if (docIdx % 10 == 0) {
            logger.info("wrote {} docs", docIdx);
        }
    }
    logger.info("done in {}ms", (System.nanoTime() - beginNanos) / 1_000_000L);

    logger.info("reading {} NamedEntities", neIds.size());
    beginNanos = System.nanoTime();
    for (String neId : neIds) {
        repository.getNamedEntity(neId);
    }
    logger.info("done in {}ms", (System.nanoTime() - beginNanos) / 1_000_000L);
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class ElasticsearchIndexer method bulkAdd.
/**
 * Sends one bulk request that updates the parent document and indexes the
 * given named entities.
 *
 * The request contains, in order: an update setting the parent's
 * {@code status} field to {@code Document.Status.DONE}, a scripted update
 * that appends the pipeline type to the parent's {@code nerTags} when it is
 * not already present, and one index request per named entity.
 *
 * @param indexName     index receiving the requests
 * @param nerType       pipeline type recorded in the parent's {@code nerTags}
 * @param namedEntities entities to index with {@code parent} as their parent id
 * @param parent        document the entities belong to
 * @return {@code true} if the bulk response reports no failure, {@code false}
 *         otherwise (each failed item is logged)
 * @throws IOException if the bulk call to the client fails
 */
@Override
public boolean bulkAdd(final String indexName, Pipeline.Type nerType, List<NamedEntity> namedEntities, Document parent) throws IOException {
    BulkRequest bulkRequest = new BulkRequest();
    // Route on the root document when there is one, otherwise on the parent itself.
    String routing = ofNullable(parent.getRootDocument()).orElse(parent.getId());
    bulkRequest.add(new UpdateRequest(indexName, parent.getId()).doc(jsonBuilder().startObject().field("status", Document.Status.DONE).endObject()).routing(routing));

    // Plain map instead of double-brace initialisation: the anonymous HashMap
    // subclass would keep a reference to this indexer for as long as the
    // script parameters are alive.
    HashMap<String, Object> scriptParams = new HashMap<>();
    scriptParams.put("nerTag", nerType.toString());
    bulkRequest.add(new UpdateRequest(indexName, parent.getId()).script(new Script(ScriptType.INLINE, "painless", "if (!ctx._source.nerTags.contains(params.nerTag)) ctx._source.nerTags.add(params.nerTag);", scriptParams)).routing(routing));

    for (Entity child : namedEntities) {
        bulkRequest.add(createIndexRequest(indexName, JsonObjectMapper.getType(child), child.getId(), getJson(child), parent.getId(), routing));
    }
    bulkRequest.setRefreshPolicy(esCfg.refreshPolicy);

    BulkResponse bulkResponse = client.bulk(bulkRequest, RequestOptions.DEFAULT);
    if (bulkResponse.hasFailures()) {
        for (BulkItemResponse resp : bulkResponse.getItems()) {
            if (resp.isFailed()) {
                LOGGER.error("bulk add failed : {}", resp.getFailureMessage());
            }
        }
        return false;
    }
    return true;
}
use of org.icij.datashare.text.NamedEntity in project datashare by ICIJ.
the class NamedEntityResourceTest method test_get_named_entity_in_prod_mode.
/**
 * With a basic-auth filter installed for user "anne", requesting a named
 * entity by id and routing must answer 200 with a JSON content type. The
 * indexer is stubbed to return the entity for that index, id and routing.
 */
@Test
public void test_get_named_entity_in_prod_mode() {
    // The same three values drive both the indexer stub and the request URL.
    final String index = "anne-datashare";
    final String entityId = "my_id";
    final String routing = "root_parent";

    configure(routes -> routes
            .add(new NamedEntityResource(indexer))
            .filter(new BasicAuthFilter("/", "icij", DatashareUser.singleUser("anne"))));

    NamedEntity stubbedEntity = create(PERSON, "mention", asList(123L), "docId", "root", CORENLP, FRENCH);
    doReturn(stubbedEntity).when(indexer).get(index, entityId, routing);

    get("/api/" + index + "/namedEntities/" + entityId + "?routing=" + routing)
            .withAuthentication("anne", "notused")
            .should()
            .respond(200)
            .haveType("application/json");
}
Aggregations