Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ: class ElasticsearchSpewerTest, method test_truncated_content.
@Test
public void test_truncated_content() throws Exception {
    ElasticsearchSpewer limitedContentSpewer = new ElasticsearchSpewer(es.client,
            text -> Language.ENGLISH, new FieldNames(), publisher,
            new PropertiesProvider(new HashMap<String, String>() {{
                put("maxContentLength", "20");
            }})).withRefresh(IMMEDIATE).withIndex("test-datashare");
    final TikaDocument document = new DocumentFactory()
            .withIdentifier(new PathIdentifier()).create(get("fake-file.txt"));
    final ParsingReader reader = new ParsingReader(
            new ByteArrayInputStream("this content should be truncated".getBytes()));
    document.setReader(reader);

    limitedContentSpewer.write(document);

    GetResponse documentFields = es.client.get(
            new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    assertThat(documentFields.getSourceAsMap())
            .includes(entry("content", "this content should"));
}
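With maxContentLength set to 20, the 32-character body is cut at the limit and the trailing whitespace dropped, which is why the assertion expects the 19-character "this content should". A minimal sketch of that observed behavior, assuming a plain cut-and-trim (the truncate helper below is hypothetical, not the ElasticsearchSpewer implementation):

// Hypothetical sketch of the observed truncation: cut at maxContentLength,
// then drop the trailing whitespace.
static String truncate(String content, int maxContentLength) {
    if (content.length() <= maxContentLength) {
        return content;
    }
    return content.substring(0, maxContentLength).trim();
}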
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ: class ElasticsearchSpewerTest, method test_metadata.
@Test
public void test_metadata() throws Exception {
    Path path = get(Objects.requireNonNull(getClass().getResource("/docs/a/b/c/doc.txt")).getPath());
    TikaDocument document = new Extractor().extract(path);

    spewer.write(document);

    GetResponse documentFields = es.client.get(
            new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    assertThat(documentFields.getSourceAsMap()).includes(
            entry("contentEncoding", "ISO-8859-1"),
            entry("contentType", "text/plain"),
            entry("nerTags", new ArrayList<>()),
            entry("contentLength", 45),
            entry("status", "INDEXED"),
            entry("path", path.toString()),
            entry("dirname", path.getParent().toString()));
}
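Fields such as contentEncoding, contentType and contentLength come from the Tika metadata attached to the extracted TikaDocument, while others, like status and nerTags, are added by the spewer itself. To inspect the raw values before they are mapped to index fields, the standard Tika Metadata API can be walked directly; a small sketch:

import org.apache.tika.metadata.Metadata;

// Dump the raw Tika metadata gathered during extraction.
Metadata metadata = document.getMetadata();
for (String name : metadata.names()) {
    System.out.println(name + " = " + metadata.get(name));
}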
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ: class ElasticsearchSpewerTest, method test_extract_id_should_be_equal_to_datashare_id.
@Test
public void test_extract_id_should_be_equal_to_datashare_id() throws IOException {
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(new HashMap<String, String>() {{
        put("idDigestMethod", Document.HASHER.toString());
    }}));
    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester("project", Document.HASHER.toString()));
    final TikaDocument extractDocument = extractor.extract(
            get(Objects.requireNonNull(getClass().getResource("/docs/embedded_doc.eml")).getPath()));

    Document document = new Document(Project.project("project"),
            get(Objects.requireNonNull(getClass().getResource("/docs/embedded_doc.eml")).getPath()),
            "This is a document to be parsed by datashare.",
            Language.FRENCH, Charset.defaultCharset(), "text/plain",
            convert(extractDocument.getMetadata()), Document.Status.INDEXED, 45L);

    assertThat(document.getId()).isEqualTo(extractDocument.getId());
}
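The ids match because both sides are configured with the same digest: the extractor's UpdatableDigester is built from Document.HASHER, and Document derives its own id with that same hasher. As a hedged illustration of turning a digest into a hex id (hexDigest is a hypothetical helper and the concrete algorithm is whatever Document.HASHER names; this is not the datashare code):

import java.security.MessageDigest;

// Hypothetical helper: hex-encode a digest of the given bytes.
static String hexDigest(byte[] input, String algorithm) throws Exception {
    MessageDigest md = MessageDigest.getInstance(algorithm);
    StringBuilder hex = new StringBuilder();
    for (byte b : md.digest(input)) {
        hex.append(String.format("%02x", b));
    }
    return hex.toString();
}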
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ: class ElasticsearchSpewerTest, method test_long_content_length.
@Test
public void test_long_content_length() throws Exception {
    final TikaDocument document = new DocumentFactory()
            .withIdentifier(new PathIdentifier()).create(get("t-file.txt"));
    final ParsingReader reader = new ParsingReader(new ByteArrayInputStream("test".getBytes()));
    document.setReader(reader);
    // a Content-Length larger than Integer.MAX_VALUE
    document.getMetadata().set("Content-Length", "7862117376");

    spewer.write(document);

    GetResponse documentFields = es.client.get(
            new GetRequest(TEST_INDEX, document.getId()), RequestOptions.DEFAULT);
    assertThat(documentFields.getSourceAsMap()).includes(entry("contentLength", 7862117376L));
}
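The point of the fixture value: 7862117376 exceeds Integer.MAX_VALUE (2147483647), so the test would fail if the spewer parsed the Content-Length header as an int instead of a long. A minimal illustration of the difference:

// Parsing the header value as a long succeeds...
long contentLength = Long.parseLong("7862117376");      // 7862117376L
// ...while parsing it as an int throws NumberFormatException:
// int broken = Integer.parseInt("7862117376");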
Use of org.icij.extract.document.TikaDocument in project datashare by ICIJ: class DatabaseSpewer, method writeDocument.
@Override
protected void writeDocument(TikaDocument tikaDocument, TikaDocument parent, TikaDocument root, int level) throws IOException {
    String content = toString(tikaDocument.getReader()).trim();
    // fall back to utf-8 when Tika reported no Content-Encoding
    Charset charset = Charset.forName(ofNullable(tikaDocument.getMetadata().get(CONTENT_ENCODING)).orElse("utf-8"));
    // keep only the media type, dropping parameters such as "; charset=..."
    String contentType = ofNullable(tikaDocument.getMetadata().get(CONTENT_TYPE)).orElse(DEFAULT_VALUE_UNKNOWN).split(";")[0];
    // -1 when no Content-Length was reported
    Long contentLength = valueOf(ofNullable(tikaDocument.getMetadata().get(CONTENT_LENGTH)).orElse("-1"));
    String parentId = parent == null ? null : parent.getId();
    String rootId = root == null ? null : root.getId();
    Document document = new Document(project, tikaDocument.getId(), tikaDocument.getPath(), content,
            languageGuesser.guess(content), charset, contentType, getMetadata(tikaDocument),
            Document.Status.INDEXED, new HashSet<>(), new Date(), parentId, rootId, (short) level, contentLength);
    repository.create(document);
}
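One detail worth calling out is the content-type handling: the Content-Type metadata value may carry parameters, and splitting on ';' keeps only the media type, matching what writeDocument stores. A standalone illustration:

// Content-Type headers may carry parameters; only the media type is kept.
String header = "text/plain; charset=ISO-8859-1";
String contentType = header.split(";")[0];  // "text/plain"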