use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class Seven7ParserTest method test7ZParsing.
/**
 * Parses the {@code test-documents.7z} archive through an {@link AutoDetectParser},
 * passing {@code recursingContext}, and verifies the detected content type plus
 * the presence of each entry path and a text fragment per entry in the output.
 */
@Test
public void test7ZParsing() throws Exception {
    // Rely on auto-detection rather than naming the 7z parser directly
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();

    // The parser must advertise 7zip among its supported types for this context
    assertTrue("No 7zip parser found", autoDetect.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));

    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, bodyHandler, meta, recursingContext);
    }

    assertEquals(TYPE_7ZIP.toString(), meta.get(Metadata.CONTENT_TYPE));

    // Entry paths interleaved with a text fragment expected for that entry;
    // the order matches the order in which the assertions are evaluated.
    String[] expectedFragments = {
        "test-documents/testEXCEL.xls",
        "Sample Excel Worksheet",
        "test-documents/testHTML.html",
        "Test Indexation Html",
        "test-documents/testOpenOffice2.odt",
        "This is a sample Open Office document",
        "test-documents/testPDF.pdf",
        "Apache Tika",
        "test-documents/testPPT.ppt",
        "Sample Powerpoint Slide",
        "test-documents/testRTF.rtf",
        "indexation Word",
        "test-documents/testTXT.txt",
        "Test d'indexation de Txt",
        "test-documents/testWORD.doc",
        "This is a sample Microsoft Word Document",
        "test-documents/testXML.xml",
        "Rida Benjelloun"
    };

    String extracted = bodyHandler.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class Seven7ParserTest method testEmbedded.
/**
 * Checks that the parser registered in the ParseContext is invoked
 * for every embedded entry of the 7z archive: nine entries are expected
 * in {@code tracker}, each with a filename and a modification date but
 * no media type.
 */
@Test
public void testEmbedded() throws Exception {
    // Rely on auto-detection rather than naming the 7z parser directly
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();

    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, bodyHandler, meta, trackingContext);
    }

    // All 9 documents should be reported, but not the directory
    assertEquals(9, tracker.filenames.size());
    assertEquals(9, tracker.mediatypes.size());
    assertEquals(9, tracker.modifiedAts.size());

    // Names are expected in exactly this order
    String[] expectedNames = {
        "test-documents/testEXCEL.xls",
        "test-documents/testHTML.html",
        "test-documents/testOpenOffice2.odt",
        "test-documents/testPDF.pdf",
        "test-documents/testPPT.ppt",
        "test-documents/testRTF.rtf",
        "test-documents/testTXT.txt",
        "test-documents/testWORD.doc",
        "test-documents/testXML.xml"
    };
    for (int i = 0; i < expectedNames.length; i++) {
        assertEquals(expectedNames[i], tracker.filenames.get(i));
    }

    // 7z doesn't store content types, so none should have been recorded
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }

    // Every entry must carry a modification date beginning with "20"
    for (String modifiedAt : tracker.modifiedAts) {
        assertNotNull(modifiedAt);
        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("20"));
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ZlibParserTest method testZlibParsing.
/**
 * Parses {@code testTXT.zlib} through an {@link AutoDetectParser}, passing
 * {@code recursingContext}, and verifies the reported content type and that
 * the extracted text contains the expected fragments.
 */
@Test
public void testZlibParsing() throws Exception {
    // Should auto-detect!
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    // Resolve the resource through this test's own class instead of the
    // unrelated ZipParserTest; the path is absolute, so the lookup is the same.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.zlib")) {
        parser.parse(stream, handler, metadata, recursingContext);
    }

    assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));

    String content = handler.toString();
    assertContains("Test d'indexation de Txt", content);
    assertContains("http://www.apache.org", content);
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class PRTParserTest method testPRTParserComplex.
/**
 * Exercises {@link PRTParser} on the more complex {@code testCADKEY2.prt}
 * sample, checking content type, date and description metadata, and a
 * list of text fragments expected in the extracted body.
 */
@Test
public void testPRTParserComplex() throws Exception {
    try (InputStream prtStream = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        new PRTParser().parse(prtStream, bodyHandler, meta);

        assertEquals("application/x-prt", meta.get(Metadata.CONTENT_TYPE));

        // This file carries both a date and a description
        assertEquals("1997-04-01T08:59:00", meta.get(Metadata.DATE));
        assertEquals("1997-04-01T08:59:00", meta.get(Metadata.CREATION_DATE));
        assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n", meta.get(TikaCoreProperties.DESCRIPTION));

        // Fragments are checked in the order listed
        String[] expectedFragments = {
            "ITEM",
            "REQ.",
            "DESCRIPTION",
            "MAT'L",
            "TOLERANCES UNLESS",
            "FRACTIONS",
            "ANGLES",
            "Acme Corporation",
            "DATE",
            "CHANGE",
            "DRAWN BY",
            "SCALE",
            "TIKA TEST DRAWING",
            "TIKA LETTERS",
            "5.82",
            // Degrees
            "112" + '°',
            "TIKA TEST LETTER",
            "17.11",
            // Diameter
            'Ø' + "�2.000",
            "Diameter",
            "The Apache Tika toolkit"
        };

        String extracted = bodyHandler.toString();
        for (String fragment : expectedFragments) {
            assertContains(fragment, extracted);
        }
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class PRTParserTest method testPRTParserBasics.
/**
 * Exercises {@link PRTParser} on the simple {@code testCADKEY.prt} sample,
 * checking content type, creation date, the absence of a description, and
 * a list of text fragments expected in the extracted body.
 */
@Test
public void testPRTParserBasics() throws Exception {
    try (InputStream prtStream = getResourceAsStream("/test-documents/testCADKEY.prt")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        new PRTParser().parse(prtStream, bodyHandler, meta);

        assertEquals("application/x-prt", meta.get(Metadata.CONTENT_TYPE));

        // A date is present in this file...
        assertEquals("2011-06-20T16:54:00", meta.get(TikaCoreProperties.CREATED));
        assertEquals("2011-06-20T16:54:00", meta.get(Metadata.CREATION_DATE));

        // ...but there is no description
        assertEquals(null, meta.get(TikaCoreProperties.DESCRIPTION));

        // Fragments are checked in the order listed.
        // "Isometric View" is deliberately not asserted: it can't be detected yet.
        String[] expectedFragments = {
            "Front View",
            "Back View",
            "Bottom View",
            "Right View",
            "Left View",
            "Axonometric View",
            "You've managed to extract all the text!",
            "This is more text",
            "Text Inside a PRT file"
        };

        String extracted = bodyHandler.toString();
        for (String fragment : expectedFragments) {
            assertContains(fragment, extracted);
        }
    }
}
Aggregations