Use of org.icij.datashare.PropertiesProvider in the datashare project by ICIJ.
The following example is from the class SourceExtractorTest, method test_get_source_for_embedded_doc().
/**
 * Extracts an e-mail with an attached PDF, indexes it, then checks that the
 * attachment's source bytes can be fetched back through {@code SourceExtractor}
 * using the embedded document's id and routing key.
 */
@Test
public void test_get_source_for_embedded_doc() throws Exception {
    // Digest document ids with the project-wide hasher so they match the index.
    Map<String, String> digestOptions = new HashMap<>();
    digestOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory factory = new DocumentFactory().configure(Options.from(digestOptions));

    Path emlPath = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    Extractor fileExtractor = new Extractor(factory);
    fileExtractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    TikaDocument extracted = fileExtractor.extract(emlPath);

    // Index the e-mail (and its embedded children) into the test index.
    ElasticsearchSpewer esSpewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(),
            Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    esSpewer.write(extracted);

    // Fetch the embedded PDF by its id and its parent's routing key.
    Document embeddedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX,
            "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
            "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    assertThat(embeddedPdf).isNotNull();
    assertThat(embeddedPdf.getContentType()).isEqualTo("application/pdf");

    InputStream sourceStream = new SourceExtractor().getSource(project(TEST_INDEX), embeddedPdf);
    assertThat(sourceStream).isNotNull();
    assertThat(getBytes(sourceStream)).hasSize(49779);
}
Use of org.icij.datashare.PropertiesProvider in the datashare project by ICIJ.
The following example is from the class SourceExtractorTest, method test_get_source_for_embedded_doc_without_metadata().
/**
 * Same extraction/indexing round-trip as the metadata test, but the source is
 * fetched with {@code SourceExtractor(true)} (metadata filtered out), so the
 * returned byte count must differ from the full 49779-byte attachment.
 */
@Test
public void test_get_source_for_embedded_doc_without_metadata() throws Exception {
    // Digest document ids with the project-wide hasher so they match the index.
    Map<String, String> digestOptions = new HashMap<>();
    digestOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory factory = new DocumentFactory().configure(Options.from(digestOptions));

    Path emlPath = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    Extractor fileExtractor = new Extractor(factory);
    fileExtractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    TikaDocument extracted = fileExtractor.extract(emlPath);

    // Index the e-mail (and its embedded children) into the test index.
    ElasticsearchSpewer esSpewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(),
            Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    esSpewer.write(extracted);

    Document embeddedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider()).get(TEST_INDEX,
            "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
            "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");

    // true => strip metadata from the returned source, changing its size.
    InputStream sourceStream = new SourceExtractor(true).getSource(project(TEST_INDEX), embeddedPdf);
    assertThat(sourceStream).isNotNull();
    assertThat(getBytes(sourceStream).length).isNotEqualTo(49779);
}
Use of org.icij.datashare.PropertiesProvider in the datashare project by ICIJ.
The following example is from the class CommonMode, method configure().
// Guice module wiring shared by Datashare run modes: configuration, language
// guessing, batch queues, Elasticsearch client/indexer, task management,
// document collections, the data bus and the NLP pipeline registry.
@Override
protected void configure() {
// Expose the already-built configuration object as a singleton instance.
bind(PropertiesProvider.class).toInstance(propertiesProvider);
bind(LanguageGuesser.class).to(OptimaizeLanguageGuesser.class);
// Queue implementation class is chosen from the "batchQueueType" property;
// defaults to the in-memory blocking queue.
String batchQueueType = propertiesProvider.get("batchQueueType").orElse("org.icij.datashare.extract.MemoryBlockingQueue");
bind(new TypeLiteral<BlockingQueue<String>>() {
}).toInstance(getBlockingQueue(propertiesProvider, batchQueueType, "ds:batchsearch:queue"));
bind(new TypeLiteral<BlockingQueue<BatchDownload>>() {
}).toInstance(getBlockingQueue(propertiesProvider, batchQueueType, "ds:batchdownload:queue"));
// One shared Elasticsearch client; the Indexer singleton is built on top of it.
RestHighLevelClient esClient = createESClient(propertiesProvider);
bind(RestHighLevelClient.class).toInstance(esClient);
bind(Indexer.class).to(ElasticsearchIndexer.class).asEagerSingleton();
bind(TaskManagerMemory.class).toInstance(new TaskManagerMemory(propertiesProvider));
install(new FactoryModuleBuilder().build(TaskFactory.class));
// Document queue/report factory: in-memory when "queueType" is "memory",
// Redis-backed implementations otherwise.
if ("memory".equals(propertiesProvider.getProperties().get("queueType"))) {
bind(DocumentCollectionFactory.class).to(MemoryDocumentCollectionFactory.class).asEagerSingleton();
} else {
install(new FactoryModuleBuilder().implement(DocumentQueue.class, RedisUserDocumentQueue.class).implement(ReportMap.class, RedisUserReportMap.class).build(DocumentCollectionFactory.class));
}
// Same memory-vs-Redis switch for the event bus, keyed on "busType".
DataBus dataBus;
if ("memory".equals(propertiesProvider.getProperties().get("busType"))) {
dataBus = new MemoryDataBus();
} else {
dataBus = new RedisDataBus(propertiesProvider);
}
// The same instance serves as both DataBus and Publisher.
bind(DataBus.class).toInstance(dataBus);
bind(Publisher.class).toInstance(dataBus);
// NLP pipelines: e-mail and CoreNLP are always registered; additional
// pipelines are loaded from the extensions directory when it exists.
PipelineRegistry pipelineRegistry = new PipelineRegistry(propertiesProvider);
pipelineRegistry.register(EmailPipeline.class);
pipelineRegistry.register(Pipeline.Type.CORENLP);
try {
pipelineRegistry.load();
} catch (FileNotFoundException e) {
// A missing extensions dir is a valid setup: log it and keep going.
LoggerFactory.getLogger(getClass()).info("extensions dir not found " + e.getMessage());
}
bind(PipelineRegistry.class).toInstance(pipelineRegistry);
}
Use of org.icij.datashare.PropertiesProvider in the datashare project by ICIJ.
The following example is from the class ElasticsearchConfigurationTest, method test_create_client_creates_settings().
/**
 * Creating the ES client from a default configuration should also create the
 * test index with its settings, visible through the low-level REST client.
 */
@Test
public void test_create_client_creates_settings() throws Exception {
    ElasticsearchConfiguration.createESClient(new PropertiesProvider());

    Request indexRequest = new Request("GET", TEST_INDEX);
    Response indexResponse = es.client.getLowLevelClient().performRequest(indexRequest);
    String responseBody = EntityUtils.toString(indexResponse.getEntity());
    assertThat(responseBody).contains("settings");
}
Use of org.icij.datashare.PropertiesProvider in the datashare project by ICIJ.
The following example is from the class ElasticsearchSpewerTest, method test_truncated_content_if_document_is_smaller_than_limit().
/**
 * With a maxContentLength of 20, a document whose content is shorter than the
 * limit must be indexed untouched (no truncation applied).
 */
@Test
public void test_truncated_content_if_document_is_smaller_than_limit() throws Exception {
    // Spewer configured with a 20-character content limit.
    HashMap<String, String> props = new HashMap<>();
    props.put("maxContentLength", "20");
    ElasticsearchSpewer spewerWithLimit = new ElasticsearchSpewer(es.client, text -> Language.ENGLISH,
            new FieldNames(), publisher, new PropertiesProvider(props)).withRefresh(IMMEDIATE).withIndex("test-datashare");

    // 18-character body: below the limit, so it must survive intact.
    TikaDocument doc = new DocumentFactory().withIdentifier(new PathIdentifier()).create(get("ok-file.txt"));
    doc.setReader(new ParsingReader(new ByteArrayInputStream("this content is ok".getBytes())));
    spewerWithLimit.write(doc);

    GetResponse indexed = es.client.get(new GetRequest(TEST_INDEX, doc.getId()), RequestOptions.DEFAULT);
    assertThat(indexed.getSourceAsMap()).includes(entry("content", "this content is ok"));
}
Aggregations