Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class ParserPostProcessor, method parse.
/**
 * Forwards the call to the delegated parser while also capturing the body
 * text, then stores the post-processing results in the metadata: the
 * captured text under "fulltext", at most its first 500 characters under
 * "summary", and every link extracted from the text as an "outlinks" value.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ContentHandler bodyCapture = new BodyContentHandler();
    // Both the caller's handler and the capture handler receive the parse events.
    super.parse(stream, new TeeContentHandler(handler, bodyCapture), metadata, context);

    String text = bodyCapture.toString();
    metadata.set("fulltext", text);

    // Clamp the summary to 500 characters; shorter text is used whole.
    String summary = text.length() > 500 ? text.substring(0, 500) : text;
    metadata.set("summary", summary);

    for (String outlink : RegexUtils.extractLinks(text)) {
        metadata.add("outlinks", outlink);
    }
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class AutoDetectParserTest, method testOggFlacAudio.
/**
 * Verifies that the Ogg audio parsers (Vorbis, Opus, Flac etc) have been
 * included and are available, and that parsing each sample file yields the
 * expected content type, metadata and text.
 */
@SuppressWarnings("deprecation")
@Test
public void testOggFlacAudio() throws Exception {
    // The four sample files carry similar test data; mediaTypes[i] is the
    // content type expected for testFiles[i].
    String[] testFiles = { "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus" };
    MediaType[] mediaTypes = {
        MediaType.parse(OGG_VORBIS),
        MediaType.parse(FLAC_NATIVE),
        MediaType.parse(OGG_FLAC),
        MediaType.parse(OGG_OPUS)
    };

    // The parser classes must load and report a set of supported types.
    VorbisParser vorbis = new VorbisParser();
    assertNotNull("Parser not found for " + mediaTypes[0], vorbis.getSupportedTypes(new ParseContext()));

    // The single FLAC parser is checked against both the native and the Ogg-wrapped type.
    FlacParser flac = new FlacParser();
    assertNotNull("Parser not found for " + mediaTypes[1], flac.getSupportedTypes(new ParseContext()));
    assertNotNull("Parser not found for " + mediaTypes[2], flac.getSupportedTypes(new ParseContext()));

    OpusParser opus = new OpusParser();
    assertNotNull("Parser not found for " + mediaTypes[3], opus.getSupportedTypes(new ParseContext()));

    // The configured composite parser must have an entry for every expected type.
    CompositeParser composite = (CompositeParser) tika.getParser();
    for (MediaType expectedType : mediaTypes) {
        assertNotNull("Parser not found for " + expectedType, composite.getParsers().get(expectedType));
    }

    // Parse every sample file and check what comes out.
    for (int idx = 0; idx < testFiles.length; idx++) {
        String fileName = testFiles[idx];
        try (InputStream in = AutoDetectParserTest.class.getResourceAsStream("/test-documents/" + fileName)) {
            assertNotNull("Could not find test file " + fileName, in);

            Metadata metadata = new Metadata();
            ContentHandler body = new BodyContentHandler();
            new AutoDetectParser(tika).parse(in, body, metadata);

            String detectedType = metadata.get(Metadata.CONTENT_TYPE);
            assertEquals("Incorrect content type for " + fileName, mediaTypes[idx].toString(), detectedType);

            // Common metadata, old style keys
            assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
            assertEquals("Test Title", metadata.get(Metadata.TITLE));

            // Common metadata, new style keys
            assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));

            // XMPDM metadata; the album check is skipped for the Opus file
            boolean isOpus = fileName.endsWith(".opus");
            if (!isOpus) {
                assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
            }
            assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
            assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
            assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));

            // The extracted text should mention the title and the artist
            String text = body.toString();
            assertTrue(text.contains("Test Title"));
            assertTrue(text.contains("Test Artist"));
        }
    }
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class AutoDetectParserTest, method testNoBombDetectedForInvalidXml.
/**
 * Checks that XML parse errors inside an archive are not mistaken for a
 * ZIP bomb: parsing the archive must complete without throwing.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
 */
@Test
public void testNoBombDetectedForInvalidXml() throws Exception {
    // Build an in-memory zip of ten zero-length (and therefore invalid)
    // XML entries named 1.xml .. 10.xml
    ByteArrayOutputStream zipBytes = new ByteArrayOutputStream();
    ZipOutputStream zip = new ZipOutputStream(zipBytes);
    int entryNumber = 1;
    while (entryNumber <= 10) {
        zip.putNextEntry(new ZipEntry(entryNumber + ".xml"));
        zip.closeEntry();
        entryNumber++;
    }
    zip.finish();
    zip.close();

    // No assertions follow: the test passes as long as parse() does not throw.
    InputStream archive = new ByteArrayInputStream(zipBytes.toByteArray());
    new AutoDetectParser(tika).parse(archive, new BodyContentHandler(-1), new Metadata());
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class AutoDetectParserTest, method testSpecificParserList.
/**
 * Test case for TIKA-514: AutoDetectParser can be constructed with an
 * explicit detector and an explicit list of supported parsers.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
 */
@Test
public void testSpecificParserList() throws Exception {
    AutoDetectParser explicitParser = new AutoDetectParser(new MyDetector(), new MyParser());
    Metadata metadata = new Metadata();
    byte[] payload = "test".getBytes(UTF_8);

    explicitParser.parse(new ByteArrayInputStream(payload), new BodyContentHandler(), metadata, new ParseContext());

    // NOTE(review): presumably MyParser sets this key when it is invoked; confirm against its definition
    assertEquals("value", metadata.get("MyParser"));
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class MyFirstTika, method parseUsingAutoDetect.
/**
 * Parses a file with an {@link AutoDetectParser} and returns the text
 * collected by a {@link BodyContentHandler}.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata   metadata object passed both to the input stream factory
 *                   and to the parser
 * @return the string form of the body content handler after parsing
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // The parser does not close the stream it is given, so close it here
    // (also on failure) instead of leaking the underlying file handle.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
Aggregations