Search in sources :

Example 26 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class IndexerHelper method indexFile.

/**
 * Writes {@code content} as UTF-8 to a file inside the temporary folder and adds a matching
 * document to the test index.
 *
 * <p>{@code fileName} is split on {@code "/"}: every segment but the last is created as a
 * nested folder through {@code fs}, and the last segment is the file name. The document is
 * created with {@code DocumentBuilder.createDoc} from the file's base name (extension
 * stripped), given the content and the file path, then added to
 * {@code ElasticsearchRule.TEST_INDEX}.
 *
 * @param fileName slash-separated relative path of the file to create
 * @param content  text written to the file and set on the document
 * @param fs       temporary folder under which the file is created
 * @return the file that was written
 * @throws IOException if creating the folders or writing the file fails
 */
File indexFile(String fileName, String content, TemporaryFolder fs) throws IOException {
    String[] pathItems = fileName.split("/");
    String leafName = pathItems[pathItems.length - 1];
    File folder = pathItems.length > 1 ? fs.newFolder(Arrays.copyOf(pathItems, pathItems.length - 1)) : fs.getRoot();
    File file = folder.toPath().resolve(leafName).toFile();
    // Files.write creates the file when it is missing (default options are CREATE,
    // TRUNCATE_EXISTING and WRITE), so no separate createNewFile() call is needed.
    Files.write(file.toPath(), content.getBytes(StandardCharsets.UTF_8));
    String docName = FilenameUtils.removeExtension(FilenameUtils.getName(fileName));
    Document document = DocumentBuilder.createDoc(docName).with(content).with(file.toPath()).build();
    indexer.add(ElasticsearchRule.TEST_INDEX, document);
    return file;
}
Also used : TikaDocument(org.icij.extract.document.TikaDocument) Document(org.icij.datashare.text.Document) File(java.io.File)

Example 27 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class EmailPipelineTest method test_emails_chunked_content.

/**
 * Processes a three-line text containing three e-mail addresses with the extra arguments
 * {@code 20} and {@code 72}, and expects only {@code baz@qux.fr} to be reported.
 */
@Test
public void test_emails_chunked_content() {
    String text = "this is a content with email@domain.com\n"
            + "and another one : foo@bar.com\n"
            + "and baz@qux.fr";
    Document doc = createDocument(text, "docId", Language.ENGLISH);

    // NOTE(review): 20 and 72 look like a chunk length and a start offset - confirm against Pipeline.process.
    List<NamedEntity> entities = pipeline.process(doc, 20, 72);

    assertThat(entities).hasSize(1);
    NamedEntity onlyEntity = entities.get(0);
    assertThat(onlyEntity.getMention()).isEqualTo("baz@qux.fr");
    // 74 is the index of "baz@qux.fr" within the full text, not within a sub-range of it.
    assertThat(onlyEntity.getOffsets()).containsExactly(74L);
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 28 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class EmailPipelineTest method test_filter_headers_that_contains_mail_addresses.

/**
 * Builds a {@code message/rfc822} document whose metadata holds e-mail addresses under many
 * header keys and checks which of them the pipeline reports as EMAIL named entities.
 *
 * <p>The addresses stored under the raw headers {@code "field"} and {@code "Message-ID"} are
 * absent from the expected list. Every expected entity carries the offset {@code -1}.
 */
@Test
public void test_filter_headers_that_contains_mail_addresses() {
    // A plain HashMap replaces double-brace initialization, which created an anonymous
    // HashMap subclass holding a reference to the enclosing test instance.
    HashMap<String, Object> metadata = new HashMap<>();
    metadata.put(tikaRawHeader("field"), "email@domain.com");
    metadata.put(tikaRawHeader("Message-ID"), "id@domain.com");
    metadata.put(tikaRawHeader("Return-Path"), "return@head.er");
    metadata.put(tikaMsgHeader("To"), "to@head.er");
    metadata.put(tikaMsgHeader("From"), "from@head.er");
    metadata.put(tikaMsgHeader("Cc"), "cc@head.er");
    metadata.put(tikaMsgHeader("Bcc"), "bcc@head.er");
    metadata.put(tika("Dc-Title"), "subject@head.er");
    metadata.put(tikaRawHeader("Reply-To"), "replyto@head.er");
    metadata.put(tikaRawHeader("Followup-To"), "followup@head.er");
    metadata.put(tikaRawHeader("Alternate-Recipient"), "alternate@head.er");
    metadata.put(tikaRawHeader("For-Handling"), "forhandling@head.er");
    metadata.put(tikaRawHeader("Resent-Reply-To"), "resent-replyto@head.er");
    metadata.put(tikaRawHeader("Resent-Sender"), "resent-sender@head.er");
    metadata.put(tikaRawHeader("Resent-From"), "resent-from@head.er");
    metadata.put(tikaRawHeader("Resent-To"), "resent-to@head.er");
    metadata.put(tikaRawHeader("Resent-cc"), "resent-cc@head.er");
    metadata.put(tikaRawHeader("Resent-bcc"), "resent-bcc@head.er");
    Document doc = createDoc("docid").with("mail content").ofMimeType("message/rfc822").with(metadata).build();

    List<NamedEntity> namedEntities = pipeline.process(doc);

    // containsExactly is order-sensitive: keep the expected entities in this exact order.
    assertThat(namedEntities).containsExactly(
            NamedEntity.create(EMAIL, "replyto@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "alternate@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-sender@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "cc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "from@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-cc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "forhandling@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-replyto@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "return@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-to@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "followup@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-bcc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "subject@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "bcc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "to@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-from@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH));
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) HashMap(java.util.HashMap) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 29 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class EmailPipelineTest method test_adds_document_headers_parsing_for_email.

/**
 * Builds a {@code message/rfc822} document whose content is an e-mail address and whose
 * metadata carries {@code To} and {@code Cc} message headers, then checks that the pipeline
 * reports all three addresses: the content address at offset {@code 0} and the two header
 * addresses at offset {@code -1}.
 */
@Test
public void test_adds_document_headers_parsing_for_email() {
    // A plain HashMap replaces double-brace initialization, which created an anonymous
    // HashMap subclass holding a reference to the enclosing test instance.
    HashMap<String, Object> metadata = new HashMap<>();
    metadata.put(tikaMsgHeader("To"), "email1@domain.com");
    metadata.put(tikaMsgHeader("Cc"), "email2@domain.com");
    Document doc = createDoc("docid").with("hello@world.com").ofMimeType("message/rfc822").with(metadata).build();

    List<NamedEntity> namedEntities = pipeline.process(doc);

    // containsExactly is order-sensitive: keep the expected entities in this exact order.
    assertThat(namedEntities).containsExactly(
            NamedEntity.create(EMAIL, "hello@world.com", asList(0L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "email2@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "email1@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH));
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) HashMap(java.util.HashMap) Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Example 30 with Document

use of org.icij.datashare.text.Document in project datashare by ICIJ.

the class NlpConsumerTest method test_on_message_processNLP__when_doc_found_in_index.

/**
 * When the indexer returns a document for the requested project, id and routing,
 * {@code findNamedEntities} is expected to initialize the pipeline with {@code ENGLISH} and
 * process that document.
 */
@Test
public void test_on_message_processNLP__when_doc_found_in_index() throws Exception {
    final String project = "projectName";
    final String routing = "routing";

    // Arrange: the pipeline initializes successfully and the indexer knows the document.
    when(pipeline.initialize(any())).thenReturn(true);
    Document indexedDoc = createDoc("content").build();
    String docId = indexedDoc.getId();
    when(pipeline.process(indexedDoc)).thenReturn(emptyList());
    when(indexer.get(project, docId, routing)).thenReturn(indexedDoc);

    // Act
    nlpListener.findNamedEntities(project, docId, routing);

    // Assert
    verify(pipeline).initialize(ENGLISH);
    verify(pipeline).process(indexedDoc);
}
Also used : Document(org.icij.datashare.text.Document) Test(org.junit.Test)

Aggregations

- Document (org.icij.datashare.text.Document): 63
- Test (org.junit.Test): 48
- PropertiesProvider (org.icij.datashare.PropertiesProvider): 19
- BatchSearch (org.icij.datashare.batch.BatchSearch): 15
- NamedEntity (org.icij.datashare.text.NamedEntity): 11
- TikaDocument (org.icij.extract.document.TikaDocument): 10
- HashMap (java.util.HashMap): 9
- Path (java.nio.file.Path): 6
- Date (java.util.Date): 5
- Indexer (org.icij.datashare.text.indexing.Indexer): 5
- File (java.io.File): 4
- IOException (java.io.IOException): 4
- InputStream (java.io.InputStream): 4
- IntStream (java.util.stream.IntStream): 4
- DocumentBuilder.createDoc (org.icij.datashare.text.DocumentBuilder.createDoc): 4
- Project.project (org.icij.datashare.text.Project.project): 4
- User (org.icij.datashare.user.User): 4
- Rule (org.junit.Rule): 4
- Arrays.asList (java.util.Arrays.asList): 3
- List (java.util.List): 3