Use of org.icij.extract.extractor.UpdatableDigester in the datashare project by ICIJ.
In the class SourceExtractorTest, the method test_get_source_for_embedded_doc_without_metadata:
@Test
public void test_get_source_for_embedded_doc_without_metadata() throws Exception {
    // document ids are content digests computed with the project's hasher
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(new HashMap<String, String>() {
        {
            put("idDigestMethod", Document.HASHER.toString());
        }
    }));
    Path path = get(getClass().getResource("/docs/embedded_doc.eml").getPath());
    // the UpdatableDigester is keyed by the index name, so ids computed during extraction
    // match the ones SourceExtractor recomputes when reading the embedded document back
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester(TEST_INDEX, Document.HASHER.toString()));
    final TikaDocument document = extractor.extract(path);
    // index the extracted mail and its attachments into Elasticsearch
    ElasticsearchSpewer spewer = new ElasticsearchSpewer(es.client, l -> Language.ENGLISH, new FieldNames(),
            Mockito.mock(Publisher.class), new PropertiesProvider()).withRefresh(IMMEDIATE).withIndex(TEST_INDEX);
    spewer.write(document);
    // fetch the embedded PDF (document id + root document id) from the index
    Document attachedPdf = new ElasticsearchIndexer(es.client, new PropertiesProvider())
            .get(TEST_INDEX,
                 "1bf2b6aa27dd8b45c7db58875004b8cb27a78ced5200b4976b63e351ebbae5ececb86076d90e156a7cdea06cde9573ca",
                 "f4078910c3e73a192e3a82d205f3c0bdb749c4e7b23c1d05a622db0f07d7f0ededb335abdb62aef41ace5d3cdb9298bc");
    // read the source with metadata filtering enabled
    InputStream source = new SourceExtractor(true).getSource(project(TEST_INDEX), attachedPdf);
    assertThat(source).isNotNull();
    // with metadata stripped, the content size differs from that of the raw embedded PDF
    assertThat(getBytes(source).length).isNotEqualTo(49779);
}
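The assertion above reads the whole stream through a getBytes helper. A minimal sketch of what such a helper might look like, assuming it simply drains the InputStream into a byte array (the implementation below is illustrative; datashare's actual test utility may differ):

// requires java.io.ByteArrayOutputStream, java.io.IOException and java.io.InputStream
static byte[] getBytes(InputStream stream) throws IOException {
    // drain the stream so its full length can be compared in the assertion
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    byte[] chunk = new byte[8192];
    int read;
    while ((read = stream.read(chunk)) != -1) {
        buffer.write(chunk, 0, read);
    }
    return buffer.toByteArray();
}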
In the class SourceExtractor, the method getSource:
public InputStream getSource(final Project project, final Document document) throws FileNotFoundException {
    if (document.isRootDocument()) {
        // root documents are read directly from disk, optionally with their metadata stripped
        if (filterMetadata) {
            try {
                return new ByteArrayInputStream(metadataCleaner.clean(new FileInputStream(document.getPath().toFile())).getContent());
            } catch (IOException e) {
                throw new ExtractException("content cleaner error ", e);
            }
        } else {
            return new FileInputStream(document.getPath().toFile());
        }
    } else {
        LOGGER.info("extracting embedded document " + Identifier.shorten(document.getId(), 4) + " from root document " + document.getPath());
        TikaDocumentSource source;
        EmbeddedDocumentMemoryExtractor embeddedExtractor;
        DigestIdentifier identifier;
        if (document.getId().length() == SHA_384.digestLength) {
            // current id scheme: SHA-384 digests keyed by the project name, recomputed with an UpdatableDigester
            embeddedExtractor = new EmbeddedDocumentMemoryExtractor(new UpdatableDigester(project.getId(), SHA_384.toString()));
            identifier = new DigestIdentifier(SHA_384.toString(), Charset.defaultCharset());
        } else {
            // backward compatibility: infer the hashing algorithm from the id length
            Hasher hasher = Hasher.valueOf(document.getId().length());
            embeddedExtractor = new EmbeddedDocumentMemoryExtractor(new CommonsDigester(20 * 1024 * 1024, hasher.toString().replace("-", "")), hasher.toString(), false);
            identifier = new DigestIdentifier(hasher.toString(), Charset.defaultCharset());
        }
        TikaDocument rootDocument = new DocumentFactory().withIdentifier(identifier).create(document.getPath());
        try {
            // re-parse the root document in memory until the embedded document with the requested id is found
            source = embeddedExtractor.extract(rootDocument, document.getId());
            return filterMetadata ?
                    new ByteArrayInputStream(metadataCleaner.clean(new ByteArrayInputStream(source.content)).getContent()) :
                    new ByteArrayInputStream(source.content);
        } catch (SAXException | TikaException | IOException e) {
            throw new ExtractException("extract error for embedded document " + document.getId(), e);
        }
    }
}
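The branch on document.getId().length() works because ids are hex-encoded digests, so the id length identifies the algorithm that produced it; the 96-character ids in the test above are SHA-384 digests. A small standalone sketch of that length/algorithm relationship using only java.security.MessageDigest (not datashare code, just an illustration under the assumption that digestLength is the hex-encoded length):

import java.security.MessageDigest;

public class DigestLengthDemo {
    public static void main(String[] args) throws Exception {
        byte[] content = "some document bytes".getBytes();
        // a SHA-384 digest is 48 bytes, i.e. 96 hex characters: ids of that length take the UpdatableDigester branch
        int sha384HexLength = MessageDigest.getInstance("SHA-384").digest(content).length * 2;
        // an id produced with, e.g., SHA-256 is 64 hex characters long and falls into the backward-compatibility branch
        int sha256HexLength = MessageDigest.getInstance("SHA-256").digest(content).length * 2;
        System.out.println(sha384HexLength + " vs " + sha256HexLength); // prints "96 vs 64"
    }
}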