Search in sources :

Example 46 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_bulk_add_with_root_document.

/**
 * Bulk-adds two documents built with the id of a root document and checks, for
 * each of them, that the indexed document reports that id as its root document
 * and that the raw Elasticsearch {@code _routing} field holds the same id.
 */
@Test
public void test_bulk_add_with_root_document() throws IOException {
    Document root = createDoc("root").build();
    assertThat(indexer.bulkAdd(TEST_INDEX, asList(createDoc("doc1").withRootId(root.getId()).build(), createDoc("doc2").withRootId(root.getId()).build()))).isTrue();
    // Run the same checks for both children: the routing assertion used to be
    // duplicated for "doc1", which left "doc2" routing unverified.
    for (String childId : asList("doc1", "doc2")) {
        assertThat(((Document) indexer.get(TEST_INDEX, childId)).getRootDocument()).isEqualTo(root.getId());
        assertThat(es.client.get(new GetRequest(TEST_INDEX, childId), RequestOptions.DEFAULT).getFields().get("_routing").getValues()).isEqualTo(asList(root.getId()));
    }
}
Also used : GetRequest(org.elasticsearch.action.get.GetRequest) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 47 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class ElasticsearchIndexerTest method test_bulk_add_named_entities.

/**
 * Indexes one document, bulk-adds two named entities attached to it for the
 * CORENLP pipeline, then checks the document status, its NER tags and that
 * both entities can be fetched back using the document id.
 */
@Test
public void test_bulk_add_named_entities() throws IOException {
    Document indexedDoc = new Document("id", project("prj"), Paths.get("doc.txt"), "content", Language.FRENCH, Charset.defaultCharset(), "application/pdf", new HashMap<>(), INDEXED, new HashSet<>(), 4324L);
    indexer.add(TEST_INDEX, indexedDoc);

    NamedEntity person = create(PERSON, "John Doe", asList(12L), "doc.txt", "root", CORENLP, Language.FRENCH);
    NamedEntity organization = create(ORGANIZATION, "AAA", asList(123L), "doc.txt", "root", CORENLP, Language.FRENCH);
    boolean bulkAdded = indexer.bulkAdd(TEST_INDEX, CORENLP, asList(person, organization), indexedDoc);
    assertThat(bulkAdded).isTrue();

    // The document is fetched again before each check, as separate lookups.
    Document afterStatusLookup = (Document) indexer.get(TEST_INDEX, indexedDoc.getId());
    assertThat(afterStatusLookup.getStatus()).isEqualTo(Document.Status.DONE);
    Document afterTagsLookup = (Document) indexer.get(TEST_INDEX, indexedDoc.getId());
    assertThat(afterTagsLookup.getNerTags()).containsOnly(CORENLP);

    // Named entities are looked up with the owning document id as extra argument.
    NamedEntity storedPerson = (NamedEntity) indexer.get(TEST_INDEX, person.getId(), indexedDoc.getId());
    assertThat(storedPerson).isNotNull();
    NamedEntity storedOrganization = (NamedEntity) indexer.get(TEST_INDEX, organization.getId(), indexedDoc.getId());
    assertThat(storedOrganization).isNotNull();
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 48 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class JooqRepositoryTest method test_create_document.

/**
 * Stores a document through the repository, reads it back by id and checks
 * that the document itself, its metadata, NER tags, extraction date and
 * project all match what was saved.
 */
@Test
public void test_create_document() throws Exception {
    HashMap<String, Object> metadata = new HashMap<>();
    metadata.put("key 1", "value 1");
    metadata.put("key 2", "value 2");
    Document expected = new Document("id", project("prj"), Paths.get("/path/to/doc"), "content", FRENCH, Charset.defaultCharset(), "text/plain", metadata, Document.Status.INDEXED, Pipeline.set(CORENLP, OPENNLP), 432L);

    repository.create(expected);
    Document reloaded = repository.getDocument(expected.getId());

    assertThat(reloaded).isEqualTo(expected);
    // Individual fields are compared as well, in addition to the whole-object equality above.
    assertThat(reloaded.getMetadata()).isEqualTo(expected.getMetadata());
    assertThat(reloaded.getNerTags()).isEqualTo(expected.getNerTags());
    assertThat(reloaded.getExtractionDate()).isEqualTo(expected.getExtractionDate());
    assertThat(reloaded.getProject()).isEqualTo(project("prj"));
}
Also used : Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 49 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class BatchSearchRunner method call.

/**
 * Runs every query of the batch search, scrolling through the matching
 * documents page by page and handing each page to {@code resultConsumer}.
 *
 * <p>Throttle, maximum time and scroll size are read from the properties
 * provider (defaults "0", "100000" and "1000"); the scroll size is capped at
 * {@code MAX_SCROLL_SIZE}. A {@code CancelException} is thrown from inside the
 * scroll loop when {@code cancelAsked} is set.
 *
 * @return the number of documents passed to the result consumer
 * @throws SearchException when Elasticsearch reports an error, on I/O failure,
 *                         on interruption, or when scrolling a query takes
 *                         longer than the configured maximum time
 */
@Override
public Integer call() throws SearchException {
    int numberOfResults = 0;
    int throttleMs = parseInt(propertiesProvider.get(BATCH_THROTTLE).orElse("0"));
    int maxTimeSeconds = parseInt(propertiesProvider.get(BATCH_SEARCH_MAX_TIME).orElse("100000"));
    // Computed in long arithmetic: an int product overflows to a negative value for
    // settings above ~2.1 million seconds, which made every query time out at once.
    long maxTimeMs = maxTimeSeconds * 1000L;
    int scrollSize = min(parseInt(propertiesProvider.get(SCROLL_SIZE).orElse("1000")), MAX_SCROLL_SIZE);
    callThread = Thread.currentThread();
    // for tests
    callWaiterLatch.countDown();
    logger.info("running {} queries for batch search {} on project {} with throttle {}ms and scroll size of {}", batchSearch.queries.size(), batchSearch.uuid, batchSearch.project, throttleMs, scrollSize);
    // Kept outside the try block so the catch clauses can report the query that failed.
    String query = null;
    try {
        for (String s : batchSearch.queries.keySet()) {
            query = s;
            Indexer.Searcher searcher = indexer.search(batchSearch.project.getId(), Document.class).with(query, batchSearch.fuzziness, batchSearch.phraseMatches).withFieldValues("contentType", batchSearch.fileTypes.toArray(new String[] {})).withPrefixQuery("dirname", batchSearch.paths.toArray(new String[] {})).withoutSource("content").limit(scrollSize);
            List<? extends Entity> docsToProcess = searcher.scroll().collect(toList());
            long beforeScrollLoop = DatashareTime.getInstance().currentTimeMillis();
            // Stop one full scroll page before the global cap so the next page cannot exceed it.
            while (docsToProcess.size() != 0 && numberOfResults < MAX_BATCH_RESULT_SIZE - MAX_SCROLL_SIZE) {
                if (cancelAsked) {
                    throw new CancelException();
                }
                resultConsumer.apply(batchSearch.uuid, query, (List<Document>) docsToProcess);
                // The time budget is measured per query, from the start of its scroll loop.
                if (DatashareTime.getInstance().currentTimeMillis() - beforeScrollLoop < maxTimeMs) {
                    DatashareTime.getInstance().sleep(throttleMs);
                } else {
                    throw new SearchException(query, new TimeoutException("Batch timed out after " + maxTimeSeconds + "s"));
                }
                numberOfResults += docsToProcess.size();
                docsToProcess = searcher.scroll().collect(toList());
            }
            searcher.clearScroll();
            totalProcessed += 1;
        }
    } catch (ElasticsearchStatusException esEx) {
        // Prefer the suppressed ResponseException, when there is one, as the reported cause.
        throw new SearchException(query, stream(esEx.getSuppressed()).filter(t -> t instanceof ResponseException).findFirst().orElse(esEx));
    } catch (IOException | InterruptedException ex) {
        // NOTE(review): the interrupt status is not restored here; confirm callers
        // do not rely on Thread.interrupted() after a cancelled run.
        throw new SearchException(query, ex);
    }
    logger.info("done batch search {} with success", batchSearch.uuid);
    return numberOfResults;
}
Also used : ResponseException(org.elasticsearch.client.ResponseException) SearchException(org.icij.datashare.batch.SearchException) IOException(java.io.IOException) Document(org.icij.datashare.text.Document) ElasticsearchStatusException(org.elasticsearch.ElasticsearchStatusException) Indexer(org.icij.datashare.text.indexing.Indexer) TimeoutException(java.util.concurrent.TimeoutException)

Example 50 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class EmailPipeline method process.

/**
 * Extracts e-mail addresses from a window of the document content and, for
 * documents whose content type is {@code message/rfc822}, from the metadata
 * values listed in {@code parsedEmailHeaders}.
 *
 * @param doc           document to scan
 * @param contentLength length of the content window to scan
 * @param contentOffset start of the window in the document content; added to
 *                      each match position found in the window
 * @return the named entities collected by the builder
 */
@Override
public List<NamedEntity> process(Document doc, int contentLength, int contentOffset) {
    String content = doc.getContent();
    // The window end is clamped to the content text length reported by the document.
    int windowEnd = Math.min(contentLength + contentOffset, doc.getContentTextLength());
    Matcher bodyMatcher = pattern.matcher(content.substring(contentOffset, windowEnd));
    NamedEntitiesBuilder builder = new NamedEntitiesBuilder(EMAIL, doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());

    for (boolean found = bodyMatcher.find(); found; found = bodyMatcher.find()) {
        // Match positions are relative to the window, so shift them by the window offset.
        builder.add(NamedEntity.Category.EMAIL, bodyMatcher.group(0), bodyMatcher.start() + contentOffset);
    }

    if (!"message/rfc822".equals(doc.getContentType())) {
        return builder.build();
    }

    // Header values are joined into one string; matches found there are added with offset -1.
    String headerValues = parsedEmailHeaders.stream()
            .map(header -> doc.getMetadata().getOrDefault(header, "").toString())
            .collect(joining(" "));
    Matcher headerMatcher = pattern.matcher(headerValues);
    while (headerMatcher.find()) {
        builder.add(NamedEntity.Category.EMAIL, headerMatcher.group(0), -1);
    }
    return builder.build();
}
Also used : NamedEntitiesBuilder(org.icij.datashare.text.NamedEntitiesBuilder) EMAIL(org.icij.datashare.text.nlp.Pipeline.Type.EMAIL) java.util(java.util) NamedEntity.allFrom(org.icij.datashare.text.NamedEntity.allFrom) AbstractPipeline(org.icij.datashare.text.nlp.AbstractPipeline) PropertiesProvider(org.icij.datashare.PropertiesProvider) Inject(com.google.inject.Inject) Document(org.icij.datashare.text.Document) Collectors.joining(java.util.stream.Collectors.joining) Matcher(java.util.regex.Matcher) Collections.unmodifiableSet(java.util.Collections.unmodifiableSet) Charset(java.nio.charset.Charset) Arrays.asList(java.util.Arrays.asList) Annotations(org.icij.datashare.text.nlp.Annotations) Pattern(java.util.regex.Pattern) Language(org.icij.datashare.text.Language) NlpStage(org.icij.datashare.text.nlp.NlpStage) NamedEntity(org.icij.datashare.text.NamedEntity)

Aggregations

Document (org.icij.datashare.text.Document)63 Test (org.junit.Test)48 PropertiesProvider (org.icij.datashare.PropertiesProvider)19 BatchSearch (org.icij.datashare.batch.BatchSearch)15 NamedEntity (org.icij.datashare.text.NamedEntity)11 TikaDocument (org.icij.extract.document.TikaDocument)10 HashMap (java.util.HashMap)9 Path (java.nio.file.Path)6 Date (java.util.Date)5 Indexer (org.icij.datashare.text.indexing.Indexer)5 File (java.io.File)4 IOException (java.io.IOException)4 InputStream (java.io.InputStream)4 IntStream (java.util.stream.IntStream)4 DocumentBuilder.createDoc (org.icij.datashare.text.DocumentBuilder.createDoc)4 Project.project (org.icij.datashare.text.Project.project)4 User (org.icij.datashare.user.User)4 Rule (org.junit.Rule)4 Arrays.asList (java.util.Arrays.asList)3 List (java.util.List)3