use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ParsingExample method parseEmbeddedExample.
/**
 * Extracts the text of the outer document together with the text of every
 * document embedded inside it. What makes the embedded documents get parsed
 * is the {@link Parser} registered in the {@link ParseContext}.
 *
 * @return the extracted content, embedded documents included
 * @throws IOException if the parse call reports a failure reading the stream
 * @throws SAXException if the parse call reports a SAX processing failure
 * @throws TikaException if the parse call reports that the document could not be parsed
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser autoDetectParser = new AutoDetectParser();
    BodyContentHandler textHandler = new BodyContentHandler();
    Metadata docMetadata = new Metadata();

    // Hand the same parser to the context so it is available while parsing
    // the embedded documents.
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoDetectParser);

    try (InputStream in = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetectParser.parse(in, textHandler, docMetadata, parseContext);
    }
    return textHandler.toString();
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ExtractEmbeddedFiles method extract.
/**
 * Parses {@code is} with this object's {@code parser}, with a
 * {@code MyEmbeddedDocumentExtractor} (built from {@code outputDir} and the
 * parse context) registered as the {@link EmbeddedDocumentExtractor}.
 * The extracted body text and metadata are not returned to the caller.
 *
 * @param is        the document stream to parse; it is not closed here
 * @param outputDir directory handed to the embedded document extractor
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 * @throws IOException   propagated from the parse call
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // NOTE(review): -1 is understood to mean "no write limit" for
    // BodyContentHandler; confirm against the Tika Javadoc.
    ContentHandler bodyHandler = new BodyContentHandler(-1);

    // The parser is registered before the extractor is built, because the
    // extractor receives this same context in its constructor.
    context.set(Parser.class, parser);
    EmbeddedDocumentExtractor embeddedExtractor = new MyEmbeddedDocumentExtractor(outputDir, context);
    context.set(EmbeddedDocumentExtractor.class, embeddedExtractor);

    parser.parse(is, bodyHandler, metadata, context);
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ContentHandlerExample method parseBodyToHTML.
/**
 * Parses the {@code test.doc} resource and returns only its body, rendered
 * as HTML in a string; the head part is left out.
 *
 * @return the body of the document as HTML
 * @throws IOException   propagated from the parse call
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public String parseBodyToHTML() throws IOException, SAXException, TikaException {
    // The XML-serialising handler is wrapped in a BodyContentHandler so that
    // only the body part reaches it.
    ToXMLContentHandler xmlSink = new ToXMLContentHandler();
    ContentHandler bodyOnly = new BodyContentHandler(xmlSink);
    AutoDetectParser autoDetectParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();

    try (InputStream in = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoDetectParser.parse(in, bodyOnly, docMetadata);
    }
    return bodyOnly.toString();
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class DisplayMetInstance method getMet.
/**
 * Parses the content at {@code url} with a {@code PDFParser} and returns the
 * metadata populated during the parse. The extracted body text is discarded.
 *
 * <p>The stream opened from the URL is closed before this method returns,
 * whether parsing succeeds or fails.
 *
 * @param url location of the document to parse
 * @return the metadata filled in by the parser
 * @throws IOException   if the URL stream cannot be opened, read or closed
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // Tika's Parser.parse consumes the stream but leaves closing it to the
    // caller, so it is owned here with try-with-resources. The fully
    // qualified name avoids requiring a new import in this file.
    try (java.io.InputStream stream = url.openStream()) {
        parser.parse(stream, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class FontParsersTest method testTTFParsing.
/**
 * Parses the TrueType test font through the auto-detecting parser and checks
 * the metadata it reports, the fields it leaves unset, and that no body text
 * is produced.
 */
@Test
public void testTTFParsing() throws Exception {
    // No font-specific parser is named: detection has to pick it.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata fontMetadata = new Metadata();

    try (TikaInputStream ttf = TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testTrueType3.ttf"))) {
        autoDetect.parse(ttf, bodyHandler, fontMetadata, new ParseContext());
    }

    final String fullName = "Open Sans Bold";
    final String createdAt = "2010-12-30T11:04:00Z";

    // Generic document properties
    assertEquals("application/x-font-ttf", fontMetadata.get(Metadata.CONTENT_TYPE));
    assertEquals(fullName, fontMetadata.get(TikaCoreProperties.TITLE));
    assertEquals(createdAt, fontMetadata.get(Metadata.CREATION_DATE));
    assertEquals(createdAt, fontMetadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-05-05T12:37:53Z", fontMetadata.get(TikaCoreProperties.MODIFIED));

    // Font-specific properties
    assertEquals(fullName, fontMetadata.get(MET_FONT_NAME));
    assertEquals("Open Sans", fontMetadata.get(MET_FONT_FAMILY_NAME));
    assertEquals("Bold", fontMetadata.get(MET_FONT_SUB_FAMILY_NAME));
    assertEquals("OpenSans-Bold", fontMetadata.get(MET_PS_NAME));

    // Only the first nine characters of these two values are compared
    String copyright = fontMetadata.get("Copyright");
    assertEquals("Digitized", copyright.substring(0, 9));
    String trademark = fontMetadata.get("Trademark");
    assertEquals("Open Sans", trademark.substring(0, 9));

    // Fields that are not extracted
    assertEquals(null, fontMetadata.get(MET_FONT_FULL_NAME));
    assertEquals(null, fontMetadata.get(MET_FONT_WEIGHT));
    assertEquals(null, fontMetadata.get(MET_FONT_VERSION));

    // Currently, the parser doesn't extract any contents
    assertEquals("", bodyHandler.toString());
}
Aggregations