Use of org.icij.datashare.text.Document in project datashare by ICIJ.
Class IndexerHelper, method indexFile.
/**
 * Writes {@code content} to a file inside the temporary folder and indexes a matching document.
 *
 * <p>{@code fileName} is split on {@code "/"}: every segment except the last is created as a
 * nested folder under {@code fs}, and the last segment is used as the file name. When there is
 * only one segment the file is placed directly in the root of {@code fs}. A document built from
 * the file's base name (extension removed), the content and the file path is then added to
 * {@code ElasticsearchRule.TEST_INDEX} through {@code indexer}.
 *
 * @param fileName slash-separated path of the file to create, relative to {@code fs}
 * @param content text written to the file as UTF-8 and used as the document content
 * @param fs temporary folder that hosts the created folders and file
 * @return the file that was written
 * @throws IOException if a folder or the file cannot be created or written
 */
File indexFile(String fileName, String content, TemporaryFolder fs) throws IOException {
    String[] pathItems = fileName.split("/");
    File folder = pathItems.length > 1
            ? fs.newFolder(Arrays.copyOf(pathItems, pathItems.length - 1))
            : fs.getRoot();
    File file = folder.toPath().resolve(pathItems[pathItems.length - 1]).toFile();
    // Files.write creates the file when it is absent and truncates it otherwise,
    // so no explicit createNewFile() step is required before writing.
    Files.write(file.toPath(), content.getBytes(StandardCharsets.UTF_8));
    String docName = FilenameUtils.removeExtension(FilenameUtils.getName(fileName));
    Document doc = DocumentBuilder.createDoc(docName).with(content).with(file.toPath()).build();
    indexer.add(ElasticsearchRule.TEST_INDEX, doc);
    return file;
}
Use of org.icij.datashare.text.Document in project datashare by ICIJ.
Class EmailPipelineTest, method test_emails_chunked_content.
/**
 * Processes a three-line text containing three e-mail addresses with the chunk arguments
 * {@code 20} and {@code 72}, and expects only {@code baz@qux.fr} to be reported.
 *
 * <p>{@code baz@qux.fr} starts at character index 74 of the full text, which is the offset
 * asserted below.
 */
@Test
public void test_emails_chunked_content() {
    String content = "this is a content with email@domain.com\n"
            + "and another one : foo@bar.com\n"
            + "and baz@qux.fr";
    Document chunkedDoc = createDocument(content, "docId", Language.ENGLISH);

    // NOTE(review): 20 and 72 presumably select the chunk (size / start offset) — confirm
    // against the signature of Pipeline.process.
    List<NamedEntity> found = pipeline.process(chunkedDoc, 20, 72);

    assertThat(found).hasSize(1);
    NamedEntity onlyEntity = found.get(0);
    assertThat(onlyEntity.getMention()).isEqualTo("baz@qux.fr");
    assertThat(onlyEntity.getOffsets()).containsExactly(74L);
}
Use of org.icij.datashare.text.Document in project datashare by ICIJ.
Class EmailPipelineTest, method test_filter_headers_that_contains_mail_addresses.
/**
 * Builds a {@code message/rfc822} document whose metadata holds e-mail addresses under many
 * headers and checks which of them the pipeline reports as named entities.
 *
 * <p>The addresses stored under the raw headers {@code "field"} and {@code "Message-ID"} are
 * absent from the expected result; every other header address is expected, each with the
 * single offset {@code -1}. The body text {@code "mail content"} contains no address.
 */
@Test
public void test_filter_headers_that_contains_mail_addresses() {
    // Plain map instead of double-brace initialization: no anonymous HashMap subclass and no
    // hidden reference to the test instance. Insertion order is kept as-is.
    HashMap<String, Object> headers = new HashMap<String, Object>();
    headers.put(tikaRawHeader("field"), "email@domain.com");
    headers.put(tikaRawHeader("Message-ID"), "id@domain.com");
    headers.put(tikaRawHeader("Return-Path"), "return@head.er");
    headers.put(tikaMsgHeader("To"), "to@head.er");
    headers.put(tikaMsgHeader("From"), "from@head.er");
    headers.put(tikaMsgHeader("Cc"), "cc@head.er");
    headers.put(tikaMsgHeader("Bcc"), "bcc@head.er");
    headers.put(tika("Dc-Title"), "subject@head.er");
    headers.put(tikaRawHeader("Reply-To"), "replyto@head.er");
    headers.put(tikaRawHeader("Followup-To"), "followup@head.er");
    headers.put(tikaRawHeader("Alternate-Recipient"), "alternate@head.er");
    headers.put(tikaRawHeader("For-Handling"), "forhandling@head.er");
    headers.put(tikaRawHeader("Resent-Reply-To"), "resent-replyto@head.er");
    headers.put(tikaRawHeader("Resent-Sender"), "resent-sender@head.er");
    headers.put(tikaRawHeader("Resent-From"), "resent-from@head.er");
    headers.put(tikaRawHeader("Resent-To"), "resent-to@head.er");
    headers.put(tikaRawHeader("Resent-cc"), "resent-cc@head.er");
    headers.put(tikaRawHeader("Resent-bcc"), "resent-bcc@head.er");

    Document doc = createDoc("docid")
            .with("mail content")
            .ofMimeType("message/rfc822")
            .with(headers)
            .build();

    List<NamedEntity> namedEntities = pipeline.process(doc);

    // NOTE(review): containsExactly is order-sensitive; the expected order below presumably
    // mirrors the iteration order of the metadata map — confirm before reordering anything.
    assertThat(namedEntities).containsExactly(
            NamedEntity.create(EMAIL, "replyto@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "alternate@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-sender@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "cc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "from@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-cc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "forhandling@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-replyto@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "return@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-to@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "followup@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-bcc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "subject@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "bcc@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "to@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "resent-from@head.er", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH));
}
Use of org.icij.datashare.text.Document in project datashare by ICIJ.
Class EmailPipelineTest, method test_adds_document_headers_parsing_for_email.
/**
 * Checks that, for a {@code message/rfc822} document, the pipeline reports the address found
 * in the content as well as the addresses stored in the {@code To} and {@code Cc} message
 * headers.
 *
 * <p>The content address is expected at offset {@code 0}; the two header addresses are
 * expected with the single offset {@code -1}.
 */
@Test
public void test_adds_document_headers_parsing_for_email() {
    // Plain map instead of double-brace initialization: no anonymous HashMap subclass and no
    // hidden reference to the test instance. Insertion order is kept as-is.
    HashMap<String, Object> headers = new HashMap<String, Object>();
    headers.put(tikaMsgHeader("To"), "email1@domain.com");
    headers.put(tikaMsgHeader("Cc"), "email2@domain.com");

    Document doc = createDoc("docid")
            .with("hello@world.com")
            .ofMimeType("message/rfc822")
            .with(headers)
            .build();

    List<NamedEntity> namedEntities = pipeline.process(doc);

    // containsExactly is order-sensitive: content address first, then Cc, then To.
    assertThat(namedEntities).containsExactly(
            NamedEntity.create(EMAIL, "hello@world.com", asList(0L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "email2@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH),
            NamedEntity.create(EMAIL, "email1@domain.com", asList(-1L), "docid", "root", Pipeline.Type.EMAIL, FRENCH));
}
Use of org.icij.datashare.text.Document in project datashare by ICIJ.
Class NlpConsumerTest, method test_on_message_processNLP__when_doc_found_in_index.
/**
 * When the indexer returns the requested document, {@code findNamedEntities} is expected to
 * initialize the pipeline with {@code ENGLISH} and to process that document.
 *
 * @throws Exception declared because the calls under test may throw
 */
@Test
public void test_on_message_processNLP__when_doc_found_in_index() throws Exception {
    final String project = "projectName";
    final String routing = "routing";
    Document indexedDoc = createDoc("content").build();
    String docId = indexedDoc.getId();

    // Stub the collaborators: the indexer finds the document, the pipeline accepts any
    // initialization argument and yields no entity for the document.
    when(pipeline.initialize(any())).thenReturn(true);
    when(pipeline.process(indexedDoc)).thenReturn(emptyList());
    when(indexer.get(project, docId, routing)).thenReturn(indexedDoc);

    nlpListener.findNamedEntities(project, docId, routing);

    verify(pipeline).initialize(ENGLISH);
    verify(pipeline).process(indexedDoc);
}
Aggregations