Use of org.icij.extract.document.PathIdentifier in the datashare project by ICIJ.
From class ElasticsearchSpewerTest, method test_truncated_content.
/**
 * Checks that a document whose text exceeds the configured {@code maxContentLength}
 * is indexed with shortened content.
 *
 * <p>A dedicated spewer is built with {@code maxContentLength} set to {@code "20"} and fed the
 * 32-character text {@code "this content should be truncated"}; the indexed {@code content}
 * field is expected to contain {@code "this content should"}.
 *
 * @throws Exception if writing the document or fetching it back from the index fails
 */
@Test
public void test_truncated_content() throws Exception {
    // Plain map rather than double-brace initialization, so no anonymous HashMap subclass is created.
    final HashMap<String, String> properties = new HashMap<>();
    properties.put("maxContentLength", "20");

    // Index into the same constant the GetRequest below reads from, so the two cannot drift apart.
    ElasticsearchSpewer limitedContentSpewer = new ElasticsearchSpewer(
            es.client, text -> Language.ENGLISH, new FieldNames(), publisher, new PropertiesProvider(properties))
            .withRefresh(IMMEDIATE)
            .withIndex(TEST_INDEX);

    final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("fake-file.txt"));
    // Explicit charset: the bytes fed to the parser must not depend on the platform default encoding.
    final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(
            "this content should be truncated".getBytes(java.nio.charset.StandardCharsets.UTF_8)));
    document.setReader(reader);

    limitedContentSpewer.write(document);

    GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    assertThat(documentFields.getSourceAsMap()).includes(entry("content", "this content should"));
}
Use of org.icij.extract.document.PathIdentifier in the datashare project by ICIJ.
From class ElasticsearchSpewerTest, method test_long_content_length.
/**
 * Verifies that a {@code Content-Length} metadata value larger than {@code Integer.MAX_VALUE}
 * ends up in the indexed {@code contentLength} field as the long {@code 7862117376L}.
 *
 * @throws Exception if writing the document or fetching it back from the index fails
 */
@Test
public void test_long_content_length() throws Exception {
    final TikaDocument doc = new DocumentFactory()
            .withIdentifier(new PathIdentifier())
            .create(get("t-file.txt"));
    final ParsingReader contentReader = new ParsingReader(new ByteArrayInputStream("test".getBytes()));
    doc.setReader(contentReader);
    // The size is supplied as metadata only; the actual stream is just four bytes.
    doc.getMetadata().set("Content-Length", "7862117376");

    spewer.write(doc);

    final GetRequest lookup = new GetRequest(TEST_INDEX, doc.getId());
    final GetResponse indexed = es.client.get(lookup, RequestOptions.DEFAULT);
    assertThat(indexed.getSourceAsMap()).includes(entry("contentLength", 7862117376L));
}
Use of org.icij.extract.document.PathIdentifier in the datashare project by ICIJ.
From class ElasticsearchSpewerTest, method test_simple_write.
/**
 * Writes a minimal document through the shared spewer and checks three things:
 * the document can be fetched back from {@code TEST_INDEX} under its own id, its
 * {@code join} field equals {@code {name=Document}}, and a message carrying the
 * document id was published on {@code Channel.NLP}.
 *
 * @throws Exception if writing the document or fetching it back from the index fails
 */
@Test
public void test_simple_write() throws Exception {
    final TikaDocument doc = new DocumentFactory()
            .withIdentifier(new PathIdentifier())
            .create(get("test-file.txt"));
    final ParsingReader contentReader = new ParsingReader(new ByteArrayInputStream("test".getBytes()));
    doc.setReader(contentReader);

    spewer.write(doc);

    // The document must exist in the index under the id computed by the factory.
    final GetResponse indexed = es.client.get(new GetRequest(TEST_INDEX, doc.getId()), RequestOptions.DEFAULT);
    assertThat(indexed.isExists()).isTrue();
    assertThat(indexed.getId()).isEqualTo(doc.getId());

    // Map equality is content-based, so a plain HashMap is an adequate expected value.
    final HashMap<String, String> expectedJoin = new HashMap<String, String>();
    expectedJoin.put("name", "Document");
    assertEquals(expectedJoin, indexed.getSourceAsMap().get("join"));

    // Capture the message handed to the publisher and check it references this document.
    final ArgumentCaptor<Message> published = ArgumentCaptor.forClass(Message.class);
    verify(publisher).publish(eq(Channel.NLP), published.capture());
    assertThat(published.getValue().content).includes(entry(Field.DOC_ID, doc.getId()));
}
Use of org.icij.extract.document.PathIdentifier in the datashare project by ICIJ.
From class ElasticsearchSpewerTest, method test_truncated_content_if_document_is_smaller_than_limit.
/**
 * Checks that a document whose text is shorter than the configured {@code maxContentLength}
 * is indexed with its content unchanged.
 *
 * <p>A dedicated spewer is built with {@code maxContentLength} set to {@code "20"} and fed the
 * 18-character text {@code "this content is ok"}; the indexed {@code content} field is expected
 * to contain that same text.
 *
 * @throws Exception if writing the document or fetching it back from the index fails
 */
@Test
public void test_truncated_content_if_document_is_smaller_than_limit() throws Exception {
    // Plain map rather than double-brace initialization, so no anonymous HashMap subclass is created.
    final HashMap<String, String> properties = new HashMap<>();
    properties.put("maxContentLength", "20");

    // Index into the same constant the GetRequest below reads from, so the two cannot drift apart.
    ElasticsearchSpewer limitedContentSpewer = new ElasticsearchSpewer(
            es.client, text -> Language.ENGLISH, new FieldNames(), publisher, new PropertiesProvider(properties))
            .withRefresh(IMMEDIATE)
            .withIndex(TEST_INDEX);

    final TikaDocument document = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("ok-file.txt"));
    // Explicit charset: the bytes fed to the parser must not depend on the platform default encoding.
    final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(
            "this content is ok".getBytes(java.nio.charset.StandardCharsets.UTF_8)));
    document.setReader(reader);

    limitedContentSpewer.write(document);

    GetResponse documentFields = es.client.get(new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    assertThat(documentFields.getSourceAsMap()).includes(entry("content", "this content is ok"));
}
Aggregations