Search in sources :

Example 51 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

In the class NlpConsumer, the method findNamedEntities.

/**
 * Fetches one document from the index, runs the NLP pipeline on its content and
 * bulk-adds the resulting named entities to the index.
 *
 * <p>Content shorter than {@code maxContentLengthChars} is processed in a single pass.
 * Longer content is processed in consecutive chunks of at most
 * {@code maxContentLengthChars} characters. Nothing is extracted when the document is
 * not found (a warning is logged) or when the pipeline cannot be initialized for the
 * document's language.
 *
 * <p>An {@link IOException} raised while reading or writing the index is logged and
 * not propagated.
 *
 * @param projectName name of the project (index) holding the document
 * @param id          identifier of the document to process
 * @param routing     routing key passed to the indexer lookup
 * @throws InterruptedException propagated from the underlying pipeline/indexer calls
 */
void findNamedEntities(final String projectName, final String id, final String routing) throws InterruptedException {
    try {
        Document doc = indexer.get(projectName, id, routing);
        if (doc == null) {
            logger.warn("no document found in index with id {}", id);
            return;
        }
        logger.info("extracting {} entities for document {}", nlpPipeline.getType(), doc.getId());
        if (!nlpPipeline.initialize(doc.getLanguage())) {
            return;
        }
        int nbEntities = 0;
        int contentLength = doc.getContent().length();
        if (contentLength < this.maxContentLengthChars) {
            List<NamedEntity> namedEntities = nlpPipeline.process(doc);
            indexer.bulkAdd(projectName, nlpPipeline.getType(), namedEntities, doc);
            nbEntities = namedEntities.size();
        } else {
            // Ceiling division: the former "length / max + 1" produced an extra, empty
            // trailing chunk whenever the length was an exact multiple of the chunk size.
            int nbChunks = contentLength / this.maxContentLengthChars
                    + (contentLength % this.maxContentLengthChars == 0 ? 0 : 1);
            logger.info("document is too large, extracting entities for {} document chunks", nbChunks);
            for (int chunkIndex = 0; chunkIndex < nbChunks; chunkIndex++) {
                List<NamedEntity> namedEntities = nlpPipeline.process(doc, maxContentLengthChars, chunkIndex * maxContentLengthChars);
                if (chunkIndex < nbChunks - 1) {
                    indexer.bulkAdd(projectName, namedEntities);
                } else {
                    // Only the last chunk uses the overload that also receives the pipeline type
                    // and the document. NOTE(review): presumably this is what flags the document
                    // as processed by the pipeline - confirm against Indexer.bulkAdd.
                    indexer.bulkAdd(projectName, nlpPipeline.getType(), namedEntities, doc);
                }
                nbEntities += namedEntities.size();
            }
        }
        logger.info("added {} named entities to document {}", nbEntities, doc.getId());
        nlpPipeline.terminate(doc.getLanguage());
    } catch (IOException e) {
        logger.error("cannot extract entities of doc " + id, e);
    }
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) IOException(java.io.IOException) Document(org.icij.datashare.text.Document)

Example 52 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

In the class BatchDownloadRunnerTest, the method test_max_zip_size.

/**
 * Runs a batch download with {@code BATCH_DOWNLOAD_MAX_SIZE} set to three times the
 * UTF-8 byte length of {@code "hello world 1"} and a scroll size of 3, then checks
 * that the produced zip archive contains 4 entries.
 */
@Test
public void test_max_zip_size() throws Exception {
    Document[] documents = IntStream.range(0, 3)
            .mapToObj(i -> createDoc("doc" + i).with(createFile(i)).with("hello world " + i).build())
            .toArray(Document[]::new);
    mockSearch.willReturn(2, documents);

    // Plain map instead of double-brace initialization (which creates an anonymous subclass).
    HashMap<String, String> properties = new HashMap<>();
    properties.put(BATCH_DOWNLOAD_MAX_SIZE, valueOf("hello world 1".getBytes(StandardCharsets.UTF_8).length * 3));
    properties.put(SCROLL_SIZE, "3");

    File zip = new BatchDownloadRunner(indexer, new PropertiesProvider(properties), new BatchDownload(project("test-datashare"), User.local(), "query"), updater).call();

    // Close the archive after inspection so the test does not leak a file handle.
    try (ZipFile zipFile = new ZipFile(zip)) {
        assertThat(zipFile.size()).isEqualTo(4);
    }
}
Also used : IntStream(java.util.stream.IntStream) MockitoAnnotations.initMocks(org.mockito.MockitoAnnotations.initMocks) Mock(org.mockito.Mock) HashMap(java.util.HashMap) Function(java.util.function.Function) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) DatashareCliOptions(org.icij.datashare.cli.DatashareCliOptions) ElasticsearchStatusException(org.elasticsearch.ElasticsearchStatusException) User(org.icij.datashare.user.User) ZipFile(java.util.zip.ZipFile) Project.project(org.icij.datashare.text.Project.project) BatchDownload(org.icij.datashare.batch.BatchDownload) Path(java.nio.file.Path) Before(org.junit.Before) PropertiesProvider(org.icij.datashare.PropertiesProvider) Files(java.nio.file.Files) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) Indexer(org.icij.datashare.text.indexing.Indexer) Test(org.junit.Test) IOException(java.io.IOException) Document(org.icij.datashare.text.Document) File(java.io.File) StandardCharsets(java.nio.charset.StandardCharsets) Rule(org.junit.Rule) String.valueOf(java.lang.String.valueOf) RestStatus(org.elasticsearch.rest.RestStatus) TemporaryFolder(org.junit.rules.TemporaryFolder) PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchDownload(org.icij.datashare.batch.BatchDownload) ZipFile(java.util.zip.ZipFile) Document(org.icij.datashare.text.Document) ZipFile(java.util.zip.ZipFile) File(java.io.File) Test(org.junit.Test)

Example 53 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

In the class BenchDocument, the method testReadsAndWrites.

/**
 * Write/read timing run against the repository: creates 100 documents, each with
 * 100 PERSON named entities, logging progress every 10 documents and the elapsed
 * time; then reads every created named entity back by id and logs that elapsed
 * time as well. Makes no assertions.
 */
@Test
public void testReadsAndWrites() {
    final int nbDocs = 100;
    final int nbNes = 100;
    LinkedList<String> createdEntityIds = new LinkedList<>();

    logger.info("writing {} documents with {} named entities", nbDocs, nbNes);
    long writeStart = System.currentTimeMillis();
    for (int d = 0; d < nbDocs; d++) {
        // A fresh metadata map per document, with the same ten entries each time.
        HashMap<String, Object> metadata = new HashMap<>();
        metadata.put("key1", "value1");
        metadata.put("key2", "value2");
        metadata.put("key3", "value3");
        metadata.put("key4", "value4");
        metadata.put("key5", "value5");
        metadata.put("key6", "value6");
        metadata.put("key7", "value7");
        metadata.put("key8", "value8");
        metadata.put("key9", "value9");
        metadata.put("key10", "value10");

        Document document = new Document(project("prj"), Paths.get("/foo/bar_" + d + ".txt"), "This is a content with Gael Giraud " + d, Language.FRENCH, Charset.defaultCharset(), "text/plain", metadata, Document.Status.INDEXED, 345L);
        repository.create(document);

        List<NamedEntity> entities = new ArrayList<>();
        for (int n = 0; n < nbNes; n++) {
            NamedEntity entity = NamedEntity.create(NamedEntity.Category.PERSON, "Gael Giraud" + n, Arrays.asList(23L), document.getId(), "root", Pipeline.Type.CORENLP, Language.FRENCH);
            createdEntityIds.add(entity.getId());
            entities.add(entity);
        }
        repository.create(entities);

        if (d % 10 == 0) {
            logger.info("wrote {} docs", d);
        }
    }
    logger.info("done in {}ms", System.currentTimeMillis() - writeStart);

    logger.info("reading " + createdEntityIds.size() + " NamedEntities");
    long readStart = System.currentTimeMillis();
    createdEntityIds.forEach(entityId -> repository.getNamedEntity(entityId));
    logger.info("done in {}ms", System.currentTimeMillis() - readStart);
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 54 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

In the class BatchDownloadRunnerTest, the method test_max_default_results.

/**
 * Runs a batch download with {@code BATCH_DOWNLOAD_MAX_NB_FILES} and
 * {@code SCROLL_SIZE} both set to 3, then checks that the produced zip archive
 * contains 3 entries.
 */
@Test
public void test_max_default_results() throws Exception {
    Document[] documents = IntStream.range(0, 3)
            .mapToObj(i -> createDoc("doc" + i).with(createFile(i)).build())
            .toArray(Document[]::new);
    mockSearch.willReturn(2, documents);

    // Plain map instead of double-brace initialization (which creates an anonymous subclass).
    HashMap<String, String> properties = new HashMap<>();
    properties.put(BATCH_DOWNLOAD_MAX_NB_FILES, "3");
    properties.put(SCROLL_SIZE, "3");

    File zip = new BatchDownloadRunner(indexer, new PropertiesProvider(properties), new BatchDownload(project("test-datashare"), User.local(), "query"), updater).call();

    // Close the archive after inspection so the test does not leak a file handle.
    try (ZipFile zipFile = new ZipFile(zip)) {
        assertThat(zipFile.size()).isEqualTo(3);
    }
}
Also used : IntStream(java.util.stream.IntStream) MockitoAnnotations.initMocks(org.mockito.MockitoAnnotations.initMocks) Mock(org.mockito.Mock) HashMap(java.util.HashMap) Function(java.util.function.Function) Assertions.assertThat(org.fest.assertions.Assertions.assertThat) DatashareCliOptions(org.icij.datashare.cli.DatashareCliOptions) ElasticsearchStatusException(org.elasticsearch.ElasticsearchStatusException) User(org.icij.datashare.user.User) ZipFile(java.util.zip.ZipFile) Project.project(org.icij.datashare.text.Project.project) BatchDownload(org.icij.datashare.batch.BatchDownload) Path(java.nio.file.Path) Before(org.junit.Before) PropertiesProvider(org.icij.datashare.PropertiesProvider) Files(java.nio.file.Files) DocumentBuilder.createDoc(org.icij.datashare.text.DocumentBuilder.createDoc) Indexer(org.icij.datashare.text.indexing.Indexer) Test(org.junit.Test) IOException(java.io.IOException) Document(org.icij.datashare.text.Document) File(java.io.File) StandardCharsets(java.nio.charset.StandardCharsets) Rule(org.junit.Rule) String.valueOf(java.lang.String.valueOf) RestStatus(org.elasticsearch.rest.RestStatus) TemporaryFolder(org.junit.rules.TemporaryFolder) PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchDownload(org.icij.datashare.batch.BatchDownload) ZipFile(java.util.zip.ZipFile) Document(org.icij.datashare.text.Document) ZipFile(java.util.zip.ZipFile) File(java.io.File) Test(org.junit.Test)

Example 55 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

In the class BatchSearchRunnerIntTest, the method test_search_with_paths_ok.

/**
 * Indexes a single document named "mydoc", runs a batch search for the query
 * "mydoc" restricted to the path "/path/to", and verifies that the result
 * consumer was called with that document as the only hit.
 */
@Test
public void test_search_with_paths_ok() throws Exception {
    final Document indexedDoc = createDoc("mydoc").build();
    indexer.add(TEST_INDEX, indexedDoc);

    final BatchSearch batchSearch = new BatchSearch(project(TEST_INDEX), "name", "desc", asSet("mydoc"), User.local(), false, null, singletonList("/path/to"), 0);
    final BatchSearchRunner runner = new BatchSearchRunner(indexer, new PropertiesProvider(), batchSearch, resultConsumer);
    runner.call();

    verify(resultConsumer).apply(batchSearch.uuid, "mydoc", singletonList(indexedDoc));
}
Also used : PropertiesProvider(org.icij.datashare.PropertiesProvider) BatchSearch(org.icij.datashare.batch.BatchSearch) Document(org.icij.datashare.text.Document)

Aggregations

Document (org.icij.datashare.text.Document)63 Test (org.junit.Test)48 PropertiesProvider (org.icij.datashare.PropertiesProvider)19 BatchSearch (org.icij.datashare.batch.BatchSearch)15 NamedEntity (org.icij.datashare.text.NamedEntity)11 TikaDocument (org.icij.extract.document.TikaDocument)10 HashMap (java.util.HashMap)9 Path (java.nio.file.Path)6 Date (java.util.Date)5 Indexer (org.icij.datashare.text.indexing.Indexer)5 File (java.io.File)4 IOException (java.io.IOException)4 InputStream (java.io.InputStream)4 IntStream (java.util.stream.IntStream)4 DocumentBuilder.createDoc (org.icij.datashare.text.DocumentBuilder.createDoc)4 Project.project (org.icij.datashare.text.Project.project)4 User (org.icij.datashare.user.User)4 Rule (org.junit.Rule)4 Arrays.asList (java.util.Arrays.asList)3 List (java.util.List)3