use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class OOXMLParserTest method testProtectedExcelFile.
/**
 * Parses a password protected Excel workbook and verifies the reported
 * content type, the protected flag and the extracted body text.
 * See TIKA-437.
 */
@Test
public void testProtectedExcelFile() throws Exception {
    Parser autoParser = new AutoDetectParser();
    Metadata meta = new Metadata();
    ContentHandler bodyHandler = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();
    String expectedType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    try (InputStream workbook = getTestDocument("protectedFile.xlsx")) {
        autoParser.parse(workbook, bodyHandler, meta, parseContext);

        // The workbook must still be detected as an OOXML spreadsheet
        assertEquals(expectedType, meta.get(Metadata.CONTENT_TYPE));
        // ... and be flagged as protected in the metadata
        assertEquals("true", meta.get(TikaMetadataKeys.PROTECTED));
        // The handler output is expected to mention "Office"
        assertContains("Office", bodyHandler.toString());
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class TesseractOCRParserTest method runOCR.
/**
 * Parses a classpath resource through a {@link RecursiveParserWrapper} around an
 * {@link AutoDetectParser}, with the given Tesseract output type and PDF inline
 * image extraction switched on, then checks the collected metadata.
 * <p>
 * The first two metadata entries are inspected unconditionally, so the
 * resource must yield at least two of them.
 *
 * @param resource       classpath location of the document to parse
 * @param nonOCRContains strings that must all appear in the concatenated content
 * @param numMetadatas   exact number of metadata entries the parse must produce
 * @param handlerType    handler type passed to the {@link BasicContentHandlerFactory}
 * @param outputType     output type set on the {@link TesseractOCRConfig}
 * @return the TIKA_CONTENT values of all metadata entries, concatenated in order
 * @throws Exception if parsing fails
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);

    // Declared by its concrete type so the collected metadata can be read without a cast
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));

    PDFParserConfig pdfParserConfig = new PDFParserConfig();
    pdfParserConfig.setExtractInlineImages(true);

    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, ocrConfig);
    // Register the wrapper itself as the Parser in the context
    ctx.set(Parser.class, wrapper);
    ctx.set(PDFParserConfig.class, pdfParserConfig);

    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        wrapper.parse(in, new DefaultHandler(), new Metadata(), ctx);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(numMetadatas, collected.size());

    StringBuilder buffer = new StringBuilder();
    for (Metadata entry : collected) {
        buffer.append(entry.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String allContent = buffer.toString();

    for (String expected : nonOCRContains) {
        assertContains(expected, allContent);
    }

    Metadata first = collected.get(0);
    Metadata second = collected.get(1);
    assertTrue(first.names().length > 10);
    assertTrue(second.names().length > 10);
    //test at least one value
    assertEquals("deflate", second.get("Compression CompressionTypeName"));
    return allContent;
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ArParserTest method testEmbedded.
/**
 * Tests that the ParseContext parser is correctly fired for all the
 * embedded entries.
 * <p>
 * Two AR archives are parsed in turn with the same auto-detecting parser;
 * each is expected to report exactly one embedded entry to the tracker.
 */
@Test
public void testEmbedded() throws Exception {
    // Should auto-detect!
    Parser parser = new AutoDetectParser();

    assertSingleTrackedEntry(parser, "/test-documents/testARofText.ar", "testTXT.txt");

    // Forget what the first archive recorded before parsing the second one
    tracker.reset();
    assertSingleTrackedEntry(parser, "/test-documents/testARofSND.ar", "testAU.au");
}

/**
 * Parses one archive with the tracking context and checks that exactly one
 * embedded entry was recorded, with the expected file name, a modification
 * date starting with "201", and no media type or creation date.
 * <p>
 * A fresh handler and metadata object are created for every call so that
 * nothing recorded while parsing one archive is carried into the next parse.
 *
 * @param parser           parser used for the outer archive
 * @param resource         classpath location of the archive
 * @param expectedFilename file name the tracker must report for the single entry
 * @throws Exception if parsing fails
 */
private void assertSingleTrackedEntry(Parser parser, String resource, String expectedFilename) throws Exception {
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = ArParserTest.class.getResourceAsStream(resource)) {
        parser.parse(stream, handler, metadata, trackingContext);
    }

    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals(expectedFilename, tracker.filenames.get(0));

    String modifiedAt = tracker.modifiedAts.get(0);
    assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("201"));

    for (String type : tracker.mediatypes) {
        assertNull(type);
    }
    for (String crt : tracker.createdAts) {
        assertNull(crt);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class Bzip2ParserTest method testEmbedded.
/**
 * Tests that the ParseContext parser is correctly
 * fired for all the embedded entries.
 * <p>
 * The bzip2 file is expected to expose a single embedded entry (the tar
 * file inside it) for which the tracker records no file name, media type
 * or dates.
 */
@Test
public void testEmbedded() throws Exception {
    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // Load the fixture through this test's own class, as testBzip2Parsing does,
    // rather than through the unrelated ZipParserTest.
    try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream("/test-documents/test-documents.tbz2")) {
        parser.parse(stream, handler, metadata, trackingContext);
    }
    // Should find a single entry, for the (compressed) tar file
    assertEquals(1, tracker.filenames.size());
    assertEquals(1, tracker.mediatypes.size());
    assertEquals(1, tracker.modifiedAts.size());
    assertEquals(null, tracker.filenames.get(0));
    assertEquals(null, tracker.mediatypes.get(0));
    assertEquals(null, tracker.createdAts.get(0));
    assertEquals(null, tracker.modifiedAts.get(0));
    // Tar file starts with the directory name
    assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class Bzip2ParserTest method testBzip2Parsing.
/**
 * Parses the bzip2 test archive with the recursing context and verifies the
 * detected content type plus, for each expected entry, that both its path
 * and a phrase from its text appear in the extracted content.
 */
@Test
public void testBzip2Parsing() throws Exception {
    // Entry path followed by a phrase expected from that entry, checked in this order
    String[] expectedSnippets = {
        "test-documents/testEXCEL.xls", "Sample Excel Worksheet",
        "test-documents/testHTML.html", "Test Indexation Html",
        "test-documents/testOpenOffice2.odt", "This is a sample Open Office document",
        "test-documents/testPDF.pdf", "Apache Tika",
        "test-documents/testPPT.ppt", "Sample Powerpoint Slide",
        "test-documents/testRTF.rtf", "indexation Word",
        "test-documents/testTXT.txt", "Test d'indexation de Txt",
        "test-documents/testWORD.doc", "This is a sample Microsoft Word Document",
        "test-documents/testXML.xml", "Rida Benjelloun"
    };

    // Should auto-detect!
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream archive = Bzip2ParserTest.class.getResourceAsStream("/test-documents/test-documents.tbz2")) {
        autoParser.parse(archive, bodyHandler, meta, recursingContext);
    }

    assertEquals("application/x-bzip2", meta.get(Metadata.CONTENT_TYPE));

    String extracted = bodyHandler.toString();
    for (String snippet : expectedSnippets) {
        assertContains(snippet, extracted);
    }
}
Aggregations