use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ODFParserTest method testODSFooter.
/**
 * Parses the {@code testFooter.ods} test document with an
 * {@link AutoDetectParser} and checks that the expected footer text
 * shows up in the extracted body content.
 */
@Test
public void testODSFooter() throws Exception {
    final Metadata odsMetadata = new Metadata();
    final ContentHandler bodyHandler = new BodyContentHandler();

    try (InputStream odsStream =
            ODFParserTest.class.getResourceAsStream("/test-documents/testFooter.ods")) {
        new AutoDetectParser().parse(odsStream, bodyHandler, odsMetadata);
    }

    // The handler's string form is the text collected during the parse.
    final String extractedText = bodyHandler.toString();
    assertContains("Here is a footer in the center area", extractedText);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class PDFParserTest method testProtectedPDF.
/**
 * Checks handling of a PDF that is "protected" with the default password:
 * the document is encrypted (potentially both text and metadata), yet it
 * can be decrypted without the user supplying anything.
 * <p>
 * The test makes three passes over {@code testPDF_protected.pdf}:
 * <ol>
 * <li>{@code getXML} without a parse context - metadata and text are
 *     asserted;</li>
 * <li>{@code getXML} with a {@link PasswordProvider} that returns the empty
 *     string - the same values are expected (the {@code Metadata.AUTHOR}
 *     check is not repeated in this pass);</li>
 * <li>a direct {@link AutoDetectParser} parse with a provider that returns a
 *     wrong password - an {@link EncryptedDocumentException} must be thrown,
 *     exactly three metadata entries must be present and no body text may
 *     have been produced.</li>
 * </ol>
 */
@Test
public void testProtectedPDF() throws Exception {
    final String expectedCreator = "The Bank of England";
    final String expectedSubject = "Speeches by Andrew G Haldane";
    final String expectedTitle = "Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009";
    // Text fragments looked up in the XML output, in assertion order.
    final String[] expectedSnippets = {
        "RETHINKING THE FINANCIAL NETWORK",
        "On 16 November 2002",
        "In many important respects"
    };

    // Pass 1: no parse context handed to getXML
    final XMLResult defaultResult = getXML("testPDF_protected.pdf");
    final Metadata defaultMetadata = defaultResult.metadata;
    assertEquals("true", defaultMetadata.get("pdf:encrypted"));
    assertEquals("application/pdf", defaultMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals(expectedCreator, defaultMetadata.get(TikaCoreProperties.CREATOR));
    assertEquals(expectedCreator, defaultMetadata.get(Metadata.AUTHOR));
    assertEquals(expectedSubject, defaultMetadata.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals(expectedSubject, defaultMetadata.get(Metadata.SUBJECT));
    assertEquals(expectedTitle, defaultMetadata.get(TikaCoreProperties.TITLE));
    for (String snippet : expectedSnippets) {
        assertContains(snippet, defaultResult.xml);
    }

    // Pass 2: same document, this time with an explicit empty password
    final ParseContext context = new ParseContext();
    context.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata ignored) {
            return "";
        }
    });
    final XMLResult emptyPasswordResult = getXML("testPDF_protected.pdf", context);
    final Metadata emptyPasswordMetadata = emptyPasswordResult.metadata;
    assertEquals("true", emptyPasswordMetadata.get("pdf:encrypted"));
    assertEquals("application/pdf", emptyPasswordMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals(expectedCreator, emptyPasswordMetadata.get(TikaCoreProperties.CREATOR));
    assertEquals(expectedSubject, emptyPasswordMetadata.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals(expectedSubject, emptyPasswordMetadata.get(Metadata.SUBJECT));
    assertEquals(expectedTitle, emptyPasswordMetadata.get(TikaCoreProperties.TITLE));
    for (String snippet : expectedSnippets) {
        assertContains(snippet, emptyPasswordResult.xml);
    }

    // Pass 3: replace the provider on the same context with one that
    // hands out a wrong password
    context.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata ignored) {
            return "WRONG!!!!";
        }
    });
    final Metadata wrongPasswordMetadata = new Metadata();
    final ContentHandler wrongPasswordHandler = new BodyContentHandler();
    boolean encryptionRejected = false;
    try (InputStream pdf = PDFParserTest.class.getResourceAsStream("/test-documents/testPDF_protected.pdf")) {
        new AutoDetectParser().parse(pdf, wrongPasswordHandler, wrongPasswordMetadata, context);
    } catch (EncryptedDocumentException expected) {
        encryptionRejected = true;
    }
    assertTrue("encryption exception", encryptionRejected);
    assertEquals("application/pdf", wrongPasswordMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals("true", wrongPasswordMetadata.get("pdf:encrypted"));
    // The three expected entries: pdf:encrypted, X-Parsed-By and Content-Type
    assertEquals("very little metadata should be parsed", 3, wrongPasswordMetadata.names().length);
    final String wrongPasswordText = wrongPasswordHandler.toString();
    assertEquals(0, wrongPasswordText.length());
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class RarParserTest method testEmbedded.
/**
 * Verifies that the parser registered in the ParseContext is invoked for
 * every embedded entry of the RAR test archive.
 * <p>
 * The archive is parsed with {@code trackingContext}; afterwards the
 * {@code tracker} is inspected for the recorded file names, media types and
 * timestamps, and the body text is checked for the entry names.
 */
@Test
public void testEmbedded() throws Exception {
    // Entry names in the order the tracker is expected to have seen them.
    final String[] namesInTrackerOrder = {
        "test-documents/testEXCEL.xls",
        "test-documents/testHTML.html",
        "test-documents/testOpenOffice2.odt",
        "test-documents/testPDF.pdf",
        "test-documents/testPPT.ppt",
        "test-documents/testRTF.rtf",
        "test-documents/testTXT.txt",
        "test-documents/testWORD.doc",
        "test-documents/testXML.xml"
    };
    // Entry names in the order they are looked up in the body text.
    final String[] namesInContentOrder = {
        "test-documents/testHTML.html",
        "test-documents/testEXCEL.xls",
        "test-documents/testOpenOffice2.odt",
        "test-documents/testPDF.pdf",
        "test-documents/testPPT.ppt",
        "test-documents/testRTF.rtf",
        "test-documents/testTXT.txt",
        "test-documents/testWORD.doc",
        "test-documents/testXML.xml"
    };

    // No RAR-specific parser is named here: auto-detection has to find it.
    final Parser autoDetecting = new AutoDetectParser();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata archiveMetadata = new Metadata();
    try (InputStream rar = RarParserTest.class.getResourceAsStream("/test-documents/test-documents.rar")) {
        autoDetecting.parse(rar, bodyHandler, archiveMetadata, trackingContext);
    }

    // All 9 documents are expected, the directory entry is not
    assertEquals(9, tracker.filenames.size());
    assertEquals(9, tracker.mediatypes.size());
    assertEquals(9, tracker.modifiedAts.size());

    // Names are expected, content types are not, since rar doesn't
    // store the content types
    for (int i = 0; i < namesInTrackerOrder.length; i++) {
        assertEquals(namesInTrackerOrder[i], tracker.filenames.get(i));
    }
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }
    for (String createdAt : tracker.createdAts) {
        assertNull(createdAt);
    }
    for (String modifiedAt : tracker.modifiedAts) {
        assertNotNull(modifiedAt);
        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("20"));
    }

    // The entry names must also appear in the content string
    final String extractedText = bodyHandler.toString();
    for (String entryName : namesInContentOrder) {
        assertContains(entryName, extractedText);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class RarParserTest method testRarParsing.
/**
 * Parses the RAR test archive with an auto-detecting parser and
 * {@code recursingContext}, then checks the reported content type and that
 * the body text contains each entry name together with a text fragment
 * expected from that entry.
 */
@Test
public void testRarParsing() throws Exception {
    // { entry name, text fragment expected from that entry }, in assertion order
    final String[][] entryAndText = {
        {"test-documents/testEXCEL.xls", "Sample Excel Worksheet"},
        {"test-documents/testHTML.html", "Test Indexation Html"},
        {"test-documents/testOpenOffice2.odt", "This is a sample Open Office document"},
        {"test-documents/testPDF.pdf", "Apache Tika"},
        {"test-documents/testPPT.ppt", "Sample Powerpoint Slide"},
        {"test-documents/testRTF.rtf", "indexation Word"},
        {"test-documents/testTXT.txt", "Test d'indexation de Txt"},
        {"test-documents/testWORD.doc", "This is a sample Microsoft Word Document"},
        {"test-documents/testXML.xml", "Rida Benjelloun"}
    };

    // No RAR-specific parser is named here: auto-detection has to find it.
    final Parser autoDetecting = new AutoDetectParser();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata archiveMetadata = new Metadata();
    try (InputStream rar = RarParserTest.class.getResourceAsStream("/test-documents/test-documents.rar")) {
        autoDetecting.parse(rar, bodyHandler, archiveMetadata, recursingContext);
    }

    assertEquals("application/x-rar-compressed", archiveMetadata.get(Metadata.CONTENT_TYPE));

    final String extractedText = bodyHandler.toString();
    for (String[] pair : entryAndText) {
        assertContains(pair[0], extractedText);
        assertContains(pair[1], extractedText);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class TarParserTest method testTarParsing.
/**
 * Parses the TAR test archive with an auto-detecting parser and
 * {@code recursingContext}, then checks the reported content type and that
 * the body text contains each entry name together with a text fragment
 * expected from that entry.
 */
@Test
public void testTarParsing() throws Exception {
    // { entry name, text fragment expected from that entry }, in assertion order
    final String[][] entryAndText = {
        {"test-documents/testEXCEL.xls", "Sample Excel Worksheet"},
        {"test-documents/testHTML.html", "Test Indexation Html"},
        {"test-documents/testOpenOffice2.odt", "This is a sample Open Office document"},
        {"test-documents/testPDF.pdf", "Apache Tika"},
        {"test-documents/testPPT.ppt", "Sample Powerpoint Slide"},
        {"test-documents/testRTF.rtf", "indexation Word"},
        {"test-documents/testTXT.txt", "Test d'indexation de Txt"},
        {"test-documents/testWORD.doc", "This is a sample Microsoft Word Document"},
        {"test-documents/testXML.xml", "Rida Benjelloun"}
    };

    // No TAR-specific parser is named here: auto-detection has to find it.
    final Parser autoDetecting = new AutoDetectParser();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata archiveMetadata = new Metadata();
    try (InputStream tar = TarParserTest.class.getResourceAsStream("/test-documents/test-documents.tar")) {
        autoDetecting.parse(tar, bodyHandler, archiveMetadata, recursingContext);
    }

    assertEquals("application/x-gtar", archiveMetadata.get(Metadata.CONTENT_TYPE));

    final String extractedText = bodyHandler.toString();
    for (String[] pair : entryAndText) {
        assertContains(pair[0], extractedText);
        assertContains(pair[1], extractedText);
    }
}
Aggregations