use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class TIAParsingExample method testTeeContentHandler.
/**
 * Parses an (empty) input once while feeding two handlers at the same time:
 * one that writes the body text to the given file and one that collects links.
 *
 * @param filename path of the file that receives the extracted body text
 * @throws Exception if the output file cannot be opened or parsing fails
 */
public static void testTeeContentHandler(String filename) throws Exception {
    Parser autoDetect = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    Metadata meta = new Metadata();
    LinkContentHandler linkCollector = new LinkContentHandler();
    // Zero-length input: the example only demonstrates how the handlers are wired together.
    try (InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
         OutputStream fileOut = new FileOutputStream(new File(filename))) {
        ContentHandler bodyWriter = new BodyContentHandler(fileOut);
        // The tee forwards every SAX event to both the body writer and the link collector.
        ContentHandler tee = new TeeContentHandler(bodyWriter, linkCollector);
        autoDetect.parse(emptyInput, tee, meta, parseContext);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class PhoneExtractingContentHandlerTest method testExtractPhoneNumbers.
/**
 * Parses the phone-number test document and checks that the numbers recorded
 * under the "phonenumbers" metadata key appear in the expected order.
 */
@Test
public void testExtractPhoneNumbers() throws Exception {
    // Expected numbers, in the order they must show up in the metadata values.
    final String[] expectedNumbers = {
        "9498888888",
        "9497777777",
        "9496666666",
        "9495555555",
        "4193404645",
        "9044687081",
        "2604094811"
    };
    Parser autoDetect = new AutoDetectParser();
    Metadata meta = new Metadata();
    // The phone-extracting handler inspects the characters it sees for phone numbers
    // before handing them on to the wrapped body handler.
    PhoneExtractingContentHandler phoneHandler =
            new PhoneExtractingContentHandler(new BodyContentHandler(), meta);
    try (InputStream document = PhoneExtractingContentHandlerTest.class
            .getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt")) {
        autoDetect.parse(document, phoneHandler, meta, new ParseContext());
    }
    String[] found = meta.getValues("phonenumbers");
    for (int i = 0; i < expectedNumbers.length; i++) {
        assertContains(expectedNumbers[i], found[i]);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class RTFParserTest method testConfig.
/**
 * Checks that memory allocation for the RTF bin element is limited via the
 * Tika config file: parsing the test document with the custom config must
 * record a TikaMemoryLimitException for the embedded stream.
 *
 * @throws Exception if the config cannot be loaded or parsing fails unexpectedly
 */
@Test
public void testConfig() throws Exception {
    // The test file's bin embedding contains only 10 bytes, so the config
    // has to set the limit to 0 for the limit to be hit.
    TikaConfig tikaConfig;
    // try-with-resources closes the config stream as soon as the config is built;
    // a null resource is tolerated, so assertNotNull still reports a missing file.
    try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml")) {
        assertNotNull(is);
        tikaConfig = new TikaConfig(is);
    }
    Parser p = new AutoDetectParser(tikaConfig);
    List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p);
    // Only the container document is expected in the result list.
    assertEquals(1, metadataList.size());
    assertContains("TikaMemoryLimitException", metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class Seven7ParserTest method test7ZParsing.
/**
 * Parses the 7z test archive through the auto-detecting parser and checks the
 * detected content type as well as the entry names and text found in the output.
 */
@Test
public void test7ZParsing() throws Exception {
    // Entry paths and text snippets that must all appear in the extracted
    // content; they are checked in exactly this order.
    final String[] expectedFragments = {
        "test-documents/testEXCEL.xls",
        "Sample Excel Worksheet",
        "test-documents/testHTML.html",
        "Test Indexation Html",
        "test-documents/testOpenOffice2.odt",
        "This is a sample Open Office document",
        "test-documents/testPDF.pdf",
        "Apache Tika",
        "test-documents/testPPT.ppt",
        "Sample Powerpoint Slide",
        "test-documents/testRTF.rtf",
        "indexation Word",
        "test-documents/testTXT.txt",
        "Test d'indexation de Txt",
        "test-documents/testWORD.doc",
        "This is a sample Microsoft Word Document",
        "test-documents/testXML.xml",
        "Rida Benjelloun"
    };

    // No explicit parser selection: the format should be auto-detected.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler textSink = new BodyContentHandler();
    Metadata meta = new Metadata();

    // 7zip has to be among the supported types before parsing is attempted.
    assertTrue("No 7zip parser found", autoDetect.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));

    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, textSink, meta, recursingContext);
    }

    assertEquals(TYPE_7ZIP.toString(), meta.get(Metadata.CONTENT_TYPE));

    String extracted = textSink.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, extracted);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class Seven7ParserTest method testEmbedded.
/**
 * Verifies that the parser supplied through the ParseContext is invoked
 * for every embedded entry of the 7z test archive.
 */
@Test
public void testEmbedded() throws Exception {
    // File names the tracker is expected to have recorded, by position.
    final String[] expectedNames = {
        "test-documents/testEXCEL.xls",
        "test-documents/testHTML.html",
        "test-documents/testOpenOffice2.odt",
        "test-documents/testPDF.pdf",
        "test-documents/testPPT.ppt",
        "test-documents/testRTF.rtf",
        "test-documents/testTXT.txt",
        "test-documents/testWORD.doc",
        "test-documents/testXML.xml"
    };

    // No explicit parser selection: the format should be auto-detected.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler textSink = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream archive = Seven7ParserTest.class.getResourceAsStream("/test-documents/test-documents.7z")) {
        autoDetect.parse(archive, textSink, meta, trackingContext);
    }

    // All 9 documents should have been seen, but not the directory.
    assertEquals(9, tracker.filenames.size());
    assertEquals(9, tracker.mediatypes.size());
    assertEquals(9, tracker.modifiedAts.size());

    // Names are expected, content types are not: 7z doesn't store content types.
    for (int i = 0; i < expectedNames.length; i++) {
        assertEquals(expectedNames[i], tracker.filenames.get(i));
    }
    for (String mediaType : tracker.mediatypes) {
        assertNull(mediaType);
    }

    // Every entry must carry a modification timestamp that starts with "20".
    for (String modifiedAt : tracker.modifiedAts) {
        assertNotNull(modifiedAt);
        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("20"));
    }
}
Aggregations