Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class ElasticsearchIndexerTest, method test_bulk_add_with_root_document:
@Test
public void test_bulk_add_with_root_document() throws IOException {
    Document root = createDoc("root").build();
    // bulk-index two child documents that both declare "root" as their root document
    assertThat(indexer.bulkAdd(TEST_INDEX, asList(createDoc("doc1").withRootId(root.getId()).build(), createDoc("doc2").withRootId(root.getId()).build()))).isTrue();
    assertThat(((Document) indexer.get(TEST_INDEX, "doc1")).getRootDocument()).isEqualTo(root.getId());
    assertThat(((Document) indexer.get(TEST_INDEX, "doc2")).getRootDocument()).isEqualTo(root.getId());
    // both children must be routed to the root document's shard via the _routing field
    assertThat(es.client.get(new GetRequest(TEST_INDEX, "doc1"), RequestOptions.DEFAULT).getFields().get("_routing").getValues()).isEqualTo(asList(root.getId()));
    // fixed: this line was a copy/paste duplicate checking "doc1" twice; "doc2"'s routing was never verified
    assertThat(es.client.get(new GetRequest(TEST_INDEX, "doc2"), RequestOptions.DEFAULT).getFields().get("_routing").getValues()).isEqualTo(asList(root.getId()));
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class ElasticsearchIndexerTest, method test_bulk_add_named_entities:
@Test
public void test_bulk_add_named_entities() throws IOException {
    // index one document, then bulk-add two named entities extracted from it
    Document document = new org.icij.datashare.text.Document("id", project("prj"), Paths.get("doc.txt"), "content", Language.FRENCH, Charset.defaultCharset(), "application/pdf", new HashMap<>(), INDEXED, new HashSet<>(), 4324L);
    indexer.add(TEST_INDEX, document);
    NamedEntity person = create(PERSON, "John Doe", asList(12L), "doc.txt", "root", CORENLP, Language.FRENCH);
    NamedEntity organization = create(ORGANIZATION, "AAA", asList(123L), "doc.txt", "root", CORENLP, Language.FRENCH);

    assertThat(indexer.bulkAdd(TEST_INDEX, CORENLP, asList(person, organization), document)).isTrue();

    // the bulk add must flip the document status to DONE and tag it with the pipeline
    assertThat(((Document) indexer.get(TEST_INDEX, document.getId())).getStatus()).isEqualTo(Document.Status.DONE);
    assertThat(((Document) indexer.get(TEST_INDEX, document.getId())).getNerTags()).containsOnly(CORENLP);
    // both entities must be retrievable, routed by their parent document id
    assertThat((NamedEntity) indexer.get(TEST_INDEX, person.getId(), document.getId())).isNotNull();
    assertThat((NamedEntity) indexer.get(TEST_INDEX, organization.getId(), document.getId())).isNotNull();
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class JooqRepositoryTest, method test_create_document:
@Test
public void test_create_document() throws Exception {
    // persist a document, then read it back and check that its fields survive the round trip
    HashMap<String, Object> metadata = new HashMap<>();
    metadata.put("key 1", "value 1");
    metadata.put("key 2", "value 2");
    Document expected = new Document("id", project("prj"), Paths.get("/path/to/doc"), "content", FRENCH, Charset.defaultCharset(), "text/plain", metadata, Document.Status.INDEXED, Pipeline.set(CORENLP, OPENNLP), 432L);

    repository.create(expected);

    Document actual = repository.getDocument(expected.getId());
    assertThat(actual).isEqualTo(expected);
    assertThat(actual.getMetadata()).isEqualTo(expected.getMetadata());
    assertThat(actual.getNerTags()).isEqualTo(expected.getNerTags());
    assertThat(actual.getExtractionDate()).isEqualTo(expected.getExtractionDate());
    assertThat(actual.getProject()).isEqualTo(project("prj"));
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class BatchSearchRunner, method call:
@Override
public Integer call() throws SearchException {
    int numberOfResults = 0;
    // throttle, max run time and scroll size all come from configuration, with defaults
    int throttleMs = parseInt(propertiesProvider.get(BATCH_THROTTLE).orElse("0"));
    int maxTimeSeconds = parseInt(propertiesProvider.get(BATCH_SEARCH_MAX_TIME).orElse("100000"));
    int scrollSize = min(parseInt(propertiesProvider.get(SCROLL_SIZE).orElse("1000")), MAX_SCROLL_SIZE);
    callThread = Thread.currentThread();
    // for tests: lets a test thread wait until the runner has actually started
    callWaiterLatch.countDown();
    logger.info("running {} queries for batch search {} on project {} with throttle {}ms and scroll size of {}", batchSearch.queries.size(), batchSearch.uuid, batchSearch.project, throttleMs, scrollSize);
    String query = null;
    try {
        for (String s : batchSearch.queries.keySet()) {
            query = s;
            Indexer.Searcher searcher = indexer.search(batchSearch.project.getId(), Document.class).with(query, batchSearch.fuzziness, batchSearch.phraseMatches).withFieldValues("contentType", batchSearch.fileTypes.toArray(new String[] {})).withPrefixQuery("dirname", batchSearch.paths.toArray(new String[] {})).withoutSource("content").limit(scrollSize);
            List<? extends Entity> docsToProcess = searcher.scroll().collect(toList());
            long beforeScrollLoop = DatashareTime.getInstance().currentTimeMillis();
            // scroll until the query is exhausted or the global result cap is reached
            while (!docsToProcess.isEmpty() && numberOfResults < MAX_BATCH_RESULT_SIZE - MAX_SCROLL_SIZE) {
                if (cancelAsked) {
                    throw new CancelException();
                }
                resultConsumer.apply(batchSearch.uuid, query, (List<Document>) docsToProcess);
                // fixed: multiply as long — "maxTimeSeconds * 1000" overflowed int for
                // configured timeouts above ~2.1 million seconds, corrupting the deadline
                if (DatashareTime.getInstance().currentTimeMillis() - beforeScrollLoop < maxTimeSeconds * 1000L) {
                    DatashareTime.getInstance().sleep(throttleMs);
                } else {
                    throw new SearchException(query, new TimeoutException("Batch timed out after " + maxTimeSeconds + "s"));
                }
                numberOfResults += docsToProcess.size();
                docsToProcess = searcher.scroll().collect(toList());
            }
            searcher.clearScroll();
            totalProcessed += 1;
        }
    } catch (ElasticsearchStatusException esEx) {
        // surface the underlying HTTP ResponseException when elasticsearch attached one
        throw new SearchException(query, stream(esEx.getSuppressed()).filter(t -> t instanceof ResponseException).findFirst().orElse(esEx));
    } catch (InterruptedException iEx) {
        Thread.currentThread().interrupt(); // restore the interrupt flag before wrapping
        throw new SearchException(query, iEx);
    } catch (IOException ioEx) {
        throw new SearchException(query, ioEx);
    }
    logger.info("done batch search {} with success", batchSearch.uuid);
    return numberOfResults;
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class EmailPipeline, method process:
@Override
public List<NamedEntity> process(Document doc, int contentLength, int contentOffset) {
    // scan the requested slice of the document body for e-mail addresses;
    // the slice is clamped so it never runs past the end of the extracted text
    int sliceEnd = Math.min(contentLength + contentOffset, doc.getContentTextLength());
    String bodySlice = doc.getContent().substring(contentOffset, sliceEnd);
    NamedEntitiesBuilder builder = new NamedEntitiesBuilder(EMAIL, doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    Matcher bodyMatcher = pattern.matcher(bodySlice);
    while (bodyMatcher.find()) {
        // offsets are reported relative to the full document content
        builder.add(NamedEntity.Category.EMAIL, bodyMatcher.group(0), bodyMatcher.start() + contentOffset);
    }
    if ("message/rfc822".equals(doc.getContentType())) {
        // for raw e-mails, also extract addresses from the parsed header metadata;
        // header matches carry no content position, hence the -1 offset
        String headerText = parsedEmailHeaders.stream().map(header -> doc.getMetadata().getOrDefault(header, "").toString()).collect(joining(" "));
        Matcher headerMatcher = pattern.matcher(headerText);
        while (headerMatcher.find()) {
            builder.add(NamedEntity.Category.EMAIL, headerMatcher.group(0), -1);
        }
    }
    return builder.build();
}
Aggregations