use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class NlpConsumer method findNamedEntities.
/**
 * Extracts named entities from one indexed document and stores them in the index.
 *
 * <p>The document is fetched from the index with {@code indexer.get}. When it exists and the
 * NLP pipeline initializes for the document language, the content is processed either in one
 * pass (content shorter than {@code maxContentLengthChars}) or chunk by chunk. The pipeline is
 * always terminated once it has been successfully initialized, even if processing fails.
 *
 * <p>An {@link IOException} raised while reading or writing the index is logged and not
 * rethrown.
 *
 * @param projectName name of the project passed to the indexer calls
 * @param id          identifier of the document to process
 * @param routing     routing value passed to {@code indexer.get}
 * @throws InterruptedException propagated from the NLP pipeline calls
 */
void findNamedEntities(final String projectName, final String id, final String routing) throws InterruptedException {
    try {
        Document doc = indexer.get(projectName, id, routing);
        if (doc == null) {
            logger.warn("no document found in index with id " + id);
            return;
        }
        logger.info("extracting {} entities for document {}", nlpPipeline.getType(), doc.getId());
        if (!nlpPipeline.initialize(doc.getLanguage())) {
            return;
        }
        try {
            int nbEntities = 0;
            final int contentLength = doc.getContent().length();
            if (contentLength < this.maxContentLengthChars) {
                List<NamedEntity> namedEntities = nlpPipeline.process(doc);
                indexer.bulkAdd(projectName, nlpPipeline.getType(), namedEntities, doc);
                nbEntities = namedEntities.size();
            } else {
                // Ceiling division written as (n - 1) / d + 1: a content length that is an exact
                // multiple of the chunk size must not produce an extra, empty trailing chunk.
                int nbChunks = (contentLength - 1) / this.maxContentLengthChars + 1;
                logger.info("document is too large, extracting entities for {} document chunks", nbChunks);
                for (int chunkIndex = 0; chunkIndex < nbChunks; chunkIndex++) {
                    List<NamedEntity> namedEntities = nlpPipeline.process(doc, maxContentLengthChars, chunkIndex * maxContentLengthChars);
                    if (chunkIndex < nbChunks - 1) {
                        indexer.bulkAdd(projectName, namedEntities);
                    } else {
                        // Only the last chunk uses the overload that also receives the pipeline
                        // type and the document.
                        // NOTE(review): presumably this is what flags the document as processed
                        // by this pipeline - confirm against the Indexer implementation.
                        indexer.bulkAdd(projectName, nlpPipeline.getType(), namedEntities, doc);
                    }
                    nbEntities += namedEntities.size();
                }
            }
            logger.info("added {} named entities to document {}", nbEntities, doc.getId());
        } finally {
            // Pair every successful initialize() with a terminate(), including on failure.
            nlpPipeline.terminate(doc.getLanguage());
        }
    } catch (IOException e) {
        logger.error("cannot extract entities of doc " + id, e);
    }
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class BatchDownloadRunnerTest method test_max_zip_size.
/**
 * Runs a batch download with a maximum size of three times the UTF-8 byte length of
 * {@code "hello world 1"} and checks the number of entries in the produced zip.
 */
@Test
public void test_max_zip_size() throws Exception {
    Document[] documents = IntStream.range(0, 3)
            .mapToObj(i -> createDoc("doc" + i).with(createFile(i)).with("hello world " + i).build())
            .toArray(Document[]::new);
    mockSearch.willReturn(2, documents);

    // Plain map instead of double-brace initialization: no anonymous HashMap subclass
    // capturing the test instance.
    HashMap<String, String> properties = new HashMap<>();
    properties.put(BATCH_DOWNLOAD_MAX_SIZE, valueOf("hello world 1".getBytes(StandardCharsets.UTF_8).length * 3));
    properties.put(SCROLL_SIZE, "3");

    File zip = new BatchDownloadRunner(indexer, new PropertiesProvider(properties), new BatchDownload(project("test-datashare"), User.local(), "query"), updater).call();

    // Close the archive so the test does not leak a file handle on the generated zip.
    try (ZipFile zipFile = new ZipFile(zip)) {
        assertThat(zipFile.size()).isEqualTo(4);
    }
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class BenchDocument method testReadsAndWrites.
/**
 * Benchmark-style test: writes 100 documents with 100 named entities each through the
 * repository, then reads every named entity back by id, logging the elapsed time of each phase.
 */
@Test
public void testReadsAndWrites() {
    final int nbDocs = 100;
    final int nbNes = 100;
    LinkedList<String> neIds = new LinkedList<>();

    // ---- write phase ----
    logger.info("writing {} documents with {} named entities", nbDocs, nbNes);
    final long writeStart = System.currentTimeMillis();
    for (int docIdx = 0; docIdx < nbDocs; docIdx++) {
        // A fresh metadata map per document, carrying ten key/value pairs.
        HashMap<String, Object> metadata = new HashMap<>();
        metadata.put("key1", "value1");
        metadata.put("key2", "value2");
        metadata.put("key3", "value3");
        metadata.put("key4", "value4");
        metadata.put("key5", "value5");
        metadata.put("key6", "value6");
        metadata.put("key7", "value7");
        metadata.put("key8", "value8");
        metadata.put("key9", "value9");
        metadata.put("key10", "value10");

        Document document = new Document(
                project("prj"),
                Paths.get("/foo/bar_" + docIdx + ".txt"),
                "This is a content with Gael Giraud " + docIdx,
                Language.FRENCH,
                Charset.defaultCharset(),
                "text/plain",
                metadata,
                Document.Status.INDEXED,
                345L);
        repository.create(document);

        List<NamedEntity> neList = new ArrayList<>();
        for (int neIdx = 0; neIdx < nbNes; neIdx++) {
            NamedEntity ne = NamedEntity.create(
                    NamedEntity.Category.PERSON,
                    "Gael Giraud" + neIdx,
                    Arrays.asList(23L),
                    document.getId(),
                    "root",
                    Pipeline.Type.CORENLP,
                    Language.FRENCH);
            // Remember the id so the read phase can fetch the entity back.
            neIds.add(ne.getId());
            neList.add(ne);
        }
        repository.create(neList);

        // Progress trace every tenth document.
        if (docIdx % 10 == 0) {
            logger.info("wrote {} docs", docIdx);
        }
    }
    final long writeEnd = System.currentTimeMillis();
    logger.info("done in {}ms", writeEnd - writeStart);

    // ---- read phase ----
    logger.info("reading " + neIds.size() + " NamedEntities");
    final long readStart = System.currentTimeMillis();
    for (String neId : neIds) {
        repository.getNamedEntity(neId);
    }
    final long readEnd = System.currentTimeMillis();
    logger.info("done in {}ms", readEnd - readStart);
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class BatchDownloadRunnerTest method test_max_default_results.
/**
 * Runs a batch download with the maximum number of files set to 3 and checks the number of
 * entries in the produced zip.
 */
@Test
public void test_max_default_results() throws Exception {
    Document[] documents = IntStream.range(0, 3)
            .mapToObj(i -> createDoc("doc" + i).with(createFile(i)).build())
            .toArray(Document[]::new);
    mockSearch.willReturn(2, documents);

    // Plain map instead of double-brace initialization: no anonymous HashMap subclass
    // capturing the test instance.
    HashMap<String, String> properties = new HashMap<>();
    properties.put(BATCH_DOWNLOAD_MAX_NB_FILES, "3");
    properties.put(SCROLL_SIZE, "3");

    File zip = new BatchDownloadRunner(indexer, new PropertiesProvider(properties), new BatchDownload(project("test-datashare"), User.local(), "query"), updater).call();

    // Close the archive so the test does not leak a file handle on the generated zip.
    try (ZipFile zipFile = new ZipFile(zip)) {
        assertThat(zipFile.size()).isEqualTo(3);
    }
}
use of org.icij.datashare.text.Document in project datashare by ICIJ.
the class BatchSearchRunnerIntTest method test_search_with_paths_ok.
/**
 * Indexes a single document, runs a batch search for the query {@code "mydoc"} built with the
 * path list {@code ["/path/to"]}, and verifies the result consumer receives that document.
 */
@Test
public void test_search_with_paths_ok() throws Exception {
    // Arrange: one document in the test index.
    Document indexedDoc = createDoc("mydoc").build();
    indexer.add(TEST_INDEX, indexedDoc);

    BatchSearch batchSearch = new BatchSearch(
            project(TEST_INDEX),
            "name",
            "desc",
            asSet("mydoc"),
            User.local(),
            false,
            null,
            singletonList("/path/to"),
            0);

    // Act: execute the search.
    BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), batchSearch, resultConsumer);
    runner.call();

    // Assert: the consumer was handed the indexed document for the "mydoc" query.
    verify(resultConsumer).apply(batchSearch.uuid, "mydoc", singletonList(indexedDoc));
}
Aggregations