Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class BenchBatchSearch, method testReadsAndWrites.
/**
 * Benchmarks the batch search repository: saves {@code nbBatchSearches} batch searches, each with
 * {@code nbQueries} queries and {@code nbResultsPerQuery} result documents per query, then logs
 * the elapsed time of the write phase and of the read phase.
 */
@Test
public void testReadsAndWrites() {
    int nbBatchSearches = 100;
    int nbQueries = 1000;
    int nbResultsPerQuery = 10;
    // The query strings do not depend on the batch index, so they are built once,
    // outside of the timed loop.
    String[] queries = IntStream.range(0, nbQueries).mapToObj(i -> "query " + i).toArray(String[]::new);
    logger.info("writing {} batch searches with {} queries and {} results per query", nbBatchSearches, nbQueries, nbResultsPerQuery);
    // System.nanoTime is meant for measuring elapsed time and is not affected by
    // wall-clock adjustments, unlike System.currentTimeMillis.
    long beginTime = System.nanoTime();
    for (int bsIdx = 0; bsIdx < nbBatchSearches; bsIdx++) {
        BatchSearch batch = new BatchSearch(project("test"), "name" + bsIdx, "desc" + bsIdx, asSet(queries), User.local());
        repository.save(batch);
        for (String q : queries) {
            List<Document> documents = IntStream.range(0, nbResultsPerQuery).mapToObj(i -> createDoc("doc" + i).build()).collect(Collectors.toList());
            repository.saveResults(batch.uuid, q, documents);
        }
        if (bsIdx % 2 == 0) {
            // bsIdx is zero-based: bsIdx + 1 batches have been saved at this point.
            logger.info("wrote {} batches", bsIdx + 1);
        }
    }
    long endTime = System.nanoTime();
    logger.info("done in {}ms", (endTime - beginTime) / 1_000_000);
    logger.info("reading batch searches");
    beginTime = System.nanoTime();
    // NOTE(review): the read call below is commented out, so the timing that follows
    // currently covers no repository access.
    // repository.get(User.local());
    endTime = System.nanoTime();
    logger.info("done in {}ms", (endTime - beginTime) / 1_000_000);
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class DatabaseSpewerTest, method test_spew_document_iso8859_encoded_is_stored_in_utf8_and_have_correct_parameters.
/**
 * Writes a text file encoded in ISO-8859-1, extracts it and passes it to the database spewer,
 * then checks the content, encoding, length and type of the document read back from the
 * spewer's repository.
 */
@Test
public void test_spew_document_iso8859_encoded_is_stored_in_utf8_and_have_correct_parameters() throws Exception {
    final String text = "chaîne en iso8859";
    final File latin1File = tmp.newFile("test_iso8859-1.txt");
    Files.write(latin1File.toPath(), singletonList(text), forName("ISO-8859-1"));

    final TikaDocument extracted = new Extractor().extract(latin1File.toPath());
    dbSpewer.write(extracted);

    final Document stored = dbSpewer.repository.getDocument(extracted.getId());
    assertThat(stored.getContent()).isEqualTo(text);
    assertThat(stored.getContentEncoding()).isEqualTo(forName("iso8859-1"));
    assertThat(stored.getContentLength()).isEqualTo(18);
    assertThat(stored.getContentType()).isEqualTo("text/plain");
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class JooqRepositoryTest, method test_get_untagged_documents.
/**
 * Creates one document tagged with CORENLP and OPENNLP and another tagged with IXAPIPE, then
 * checks which documents the repository returns as not tagged with a given pipeline.
 */
@Test
public void test_get_untagged_documents() throws Exception {
    final Document taggedByCoreAndOpenNlp = new Document(
            "idCore", project("prj"), Paths.get("/path/to/coreAndOpenNlp"), "coreAndOpenNlp",
            FRENCH, Charset.defaultCharset(), "text/plain", new HashMap<>(),
            Document.Status.INDEXED, Pipeline.set(CORENLP, OPENNLP), 432L);
    final Document taggedByIxaPipe = new Document(
            "idIxa", project("prj"), Paths.get("/path/to/ixaPipe"), "ixaPipe",
            FRENCH, Charset.defaultCharset(), "text/plain", new HashMap<>(),
            Document.Status.INDEXED, Pipeline.set(IXAPIPE), 234L);

    repository.create(taggedByCoreAndOpenNlp);
    repository.create(taggedByIxaPipe);

    assertThat(repository.getDocumentsNotTaggedWithPipeline(project("prj"), IXAPIPE))
            .containsExactly(taggedByCoreAndOpenNlp);
    assertThat(repository.getDocumentsNotTaggedWithPipeline(project("prj"), CORENLP))
            .containsExactly(taggedByIxaPipe);
    assertThat(repository.getDocumentsNotTaggedWithPipeline(project("prj"), MITIE))
            .containsExactly(taggedByCoreAndOpenNlp, taggedByIxaPipe);
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class ElasticsearchSpewerTest, method test_extract_id_should_be_equal_to_datashare_id.
/**
 * Checks that the id computed by the extractor for an embedded-mail resource equals the id of a
 * {@link Document} built for the same path when both use {@code Document.HASHER} as digest method.
 *
 * @throws IOException if the extraction of the resource fails
 */
@Test
public void test_extract_id_should_be_equal_to_datashare_id() throws IOException {
    // A plain map replaces the former double-brace initialization, which created an
    // anonymous HashMap subclass only to hold a single entry.
    HashMap<String, String> factoryOptions = new HashMap<>();
    factoryOptions.put("idDigestMethod", Document.HASHER.toString());
    DocumentFactory tikaFactory = new DocumentFactory().configure(Options.from(factoryOptions));

    Extractor extractor = new Extractor(tikaFactory);
    extractor.setDigester(new UpdatableDigester("project", Document.HASHER.toString()));

    // Resolve the resource location once so that the extractor and the Document are
    // guaranteed to be given the same path.
    String emlPath = Objects.requireNonNull(getClass().getResource("/docs/embedded_doc.eml")).getPath();
    final TikaDocument extractDocument = extractor.extract(get(emlPath));
    Document document = new Document(Project.project("project"), get(emlPath), "This is a document to be parsed by datashare.", Language.FRENCH, Charset.defaultCharset(), "text/plain", convert(extractDocument.getMetadata()), Document.Status.INDEXED, 45L);

    assertThat(document.getId()).isEqualTo(extractDocument.getId());
}
Use of org.icij.datashare.text.Document in the datashare project by ICIJ.
From the class NerResourceTest, method test_post_empty_text.
/**
 * Posts the content of a document built with an empty text to the CORENLP findNames endpoint and
 * checks that the response is a 200 containing "[]", and that the pipeline was initialized with
 * ENGLISH and asked to process the document.
 */
@Test
public void test_post_empty_text() throws Exception {
    final Document emptyDoc = DocumentBuilder.createDoc("inline")
            .with("")
            .with(ENGLISH)
            .build();
    doReturn(emptyList()).when(pipeline).process(eq(emptyDoc));

    post("/api/ner/findNames/CORENLP", emptyDoc.getContent())
            .should()
            .respond(200)
            .contain("[]");

    verify(pipeline).initialize(ENGLISH);
    verify(pipeline).process(emptyDoc);
}
Aggregations