use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class BundleIT method testTikaBundle.
/**
 * Parses {@code src/test/resources/test-documents.zip} with the parser obtained
 * from a {@link Tika} facade and checks that the extracted text mentions every
 * expected file name together with a sample phrase for that file.
 *
 * @throws Exception if the archive cannot be opened or parsed
 */
@Test
public void testTikaBundle() throws Exception {
    Tika facade = new Tika();

    // Package extraction
    ContentHandler bodyText = new BodyContentHandler();
    Parser delegate = facade.getParser();
    ParseContext parseContext = new ParseContext();
    // Register the parser in the context so it is available while parsing
    parseContext.set(Parser.class, delegate);

    try (InputStream archive = new FileInputStream("src/test/resources/test-documents.zip")) {
        delegate.parse(archive, bodyText, new Metadata(), parseContext);
    }

    // File name followed by a phrase expected in that file, in checking order
    String[] expectedFragments = {
        "testEXCEL.xls", "Sample Excel Worksheet",
        "testHTML.html", "Test Indexation Html",
        "testOpenOffice2.odt", "This is a sample Open Office document",
        "testPDF.pdf", "Apache Tika",
        "testPPT.ppt", "Sample Powerpoint Slide",
        "testRTF.rtf", "indexation Word",
        "testTXT.txt", "Test d'indexation de Txt",
        "testWORD.doc", "This is a sample Microsoft Word Document",
        "testXML.xml", "Rida Benjelloun"
    };

    String extracted = bodyText.toString();
    for (String fragment : expectedFragments) {
        assertTrue(extracted.contains(fragment));
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class OutlookExtractor method parse.
/**
 * Extracts the metadata, body and attachments of the Outlook message held in
 * {@code msg}.
 * <p>
 * Metadata (message class, subject, conversation topic, recipients, raw
 * headers and dates) is written to {@code metadata}. The subject, the
 * From/To/Cc/Bcc/Recipients lines, the message body and each attachment are
 * written to {@code xhtml}. For the body the preference order is HTML, then
 * compressed RTF, then plain text.
 *
 * @param xhtml    handler that receives the XHTML events for the message
 * @param metadata metadata object populated from the message
 * @throws TikaException if a {@code ChunkNotFoundException} escapes one of the
 *                       locally handled lookups
 * @throws SAXException  if the content handler fails
 * @throws IOException   if reading the message data fails
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);

        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException ignored) {
            // Best effort: the message class is optional metadata, carry on without it
        }

        // If the message contains strings that aren't stored
        // as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }

        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();

        handleFromTo(headers, metadata);

        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Best effort: recipient addresses are optional, carry on without them
        }

        // Export every header value under the raw-header prefix
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }

        // Date - try two ways to find it
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Failing that, fall back to the Date header.
            // NOTE(review): normalizeHeaders is not visible here. The loop above
            // treats the map keys as header names and the arrays as their values,
            // so the date text is read from the values. A key that carries the
            // whole "Date: <text>" line is still honoured for compatibility.
            for (Map.Entry<String, String[]> header : headers.entrySet()) {
                String headerKey = header.getKey();
                if (headerKey == null) {
                    continue;
                }
                String lowerKey = headerKey.trim().toLowerCase(Locale.ROOT);
                if (!lowerKey.equals("date") && !lowerKey.startsWith("date:")) {
                    continue;
                }

                // Text after the colon, when the key holds the complete header line
                int colon = headerKey.indexOf(':');
                String date = colon < 0 ? "" : headerKey.substring(colon + 1).trim();

                // Otherwise use the first non-blank value stored for the header
                if (date.isEmpty() && header.getValue() != null) {
                    for (String candidate : header.getValue()) {
                        if (candidate != null && !candidate.trim().isEmpty()) {
                            date = candidate.trim();
                            break;
                        }
                    }
                }
                if (date.isEmpty()) {
                    // Nothing usable under this key, keep looking
                    continue;
                }

                // See if we can parse it as a normal mail date
                try {
                    Date d = MboxParser.parseDate(date);
                    metadata.set(TikaCoreProperties.CREATED, d);
                    metadata.set(TikaCoreProperties.MODIFIED, d);
                } catch (ParseException e) {
                    // Store it as-is, and hope for the best...
                    metadata.set(TikaCoreProperties.CREATED, date);
                    metadata.set(TikaCoreProperties.MODIFIED, date);
                }
                break;
            }
        }

        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        // often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: leave the Recipients line out when it is unavailable
        }
        xhtml.endElement("dl");

        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body may be stored either as bytes or as a string chunk
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Prefer a parser already available through the parse context
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            // Prefer a parser already available through the parse context
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");

        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");

            // Prefer the long file name, fall back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }

            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }

            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
        //You'd think you'd want to call msg.close().
        //Don't do that. That closes down the file system.
        //If an msg has multiple msg attachments, some of them
        //can reside in the same file system. After the first
        //child is read, the fs is closed, and the other children
        //get a java.nio.channels.ClosedChannelException
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ParsingEmbeddedDocumentExtractor method parseEmbedded.
/**
 * Parses an embedded document with the delegating parser and forwards its
 * body content to {@code handler}.
 * <p>
 * When {@code outputHtml} is set, the content is wrapped in a
 * {@code <div class="package-entry">} and preceded by an {@code <h1>} holding
 * the resource name, if the metadata carries a non-empty one. Documents that
 * fail with an {@code EncryptedDocumentException} or another
 * {@code TikaException} are skipped silently; the wrapping element is still
 * closed.
 *
 * @param stream     stream of the embedded document; it is shielded from being closed here
 * @param handler    handler receiving the SAX events
 * @param metadata   metadata of the embedded document, read for the resource name
 * @param outputHtml whether to emit the wrapping div and the title heading
 * @throws SAXException if the handler fails
 * @throws IOException  if reading the stream or releasing temporary resources fails
 */
public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
        handler.startElement(XHTML, "div", "div", divAttributes);
    }

    String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
    boolean emitTitle = outputHtml && name != null && !name.isEmpty();
    if (emitTitle) {
        handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
        handler.characters(name.toCharArray(), 0, name.length());
        handler.endElement(XHTML, "h1", "h1");
    }

    // Use the delegate parser to parse this entry
    try (TemporaryResources tmp = new TemporaryResources()) {
        final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
        // Hand an already opened container over to the new stream
        if (stream instanceof TikaInputStream) {
            final Object container = ((TikaInputStream) stream).getOpenContainer();
            if (container != null) {
                newStream.setOpenContainer(container);
            }
        }
        ContentHandler embeddedBody = new EmbeddedContentHandler(new BodyContentHandler(handler));
        DELEGATING_PARSER.parse(newStream, embeddedBody, metadata, context);
    } catch (EncryptedDocumentException ede) {
        // TODO: can we log a warning that we lack the password?
        // For now, just skip the content
    } catch (TikaException e) {
        // TODO: can we log a warning somehow?
        // Could not parse the entry, just skip the content
    }

    if (outputHtml) {
        handler.endElement(XHTML, "div", "div");
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class EpubParser method parse.
/**
 * Parses an EPUB container by walking its zip entries in order.
 * <p>
 * The {@code mimetype} entry supplies the content type, {@code metadata.xml}
 * and {@code *.opf} entries are handed to the {@code meta} parser, and
 * {@code *.html} / {@code *.xhtml} entries are handed to the {@code content}
 * parser, whose body output is forwarded into a single XHTML document. Other
 * entries are ignored. The zip stream wrapping {@code stream} is not closed here.
 *
 * @param stream   the EPUB data
 * @param handler  handler receiving the combined XHTML output
 * @param metadata metadata to populate
 * @param context  parse context passed on to the delegate parsers
 * @throws IOException   if reading the zip stream fails
 * @throws SAXException  if the handler fails
 * @throws TikaException if a delegate parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Because an EPub file is often made up of multiple XHTML files,
    // we need explicit control over the start and end of the document
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));

    ZipInputStream zip = new ZipInputStream(stream);
    for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
        String entryName = entry.getName();
        if (entryName.equals("mimetype")) {
            String type = IOUtils.toString(zip, UTF_8);
            // often has trailing new lines
            metadata.set(Metadata.CONTENT_TYPE, type == null ? null : type.trim());
        } else if (entryName.equals("metadata.xml") || entryName.endsWith(".opf")) {
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (entryName.endsWith(".html") || entryName.endsWith(".xhtml")) {
            content.parse(zip, childHandler, metadata, context);
        }
    }

    // Finish everything
    xhtml.endDocument();
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class NetCDFParserTest method testParseGlobalMetadata.
/**
 * Parses the sample NetCDF file {@code sresa1b_ncar_ccsm3_0_run1_200001.nc}
 * and checks the metadata values and the extracted text.
 *
 * @throws Exception if the test resource cannot be read or parsed
 */
@Test
public void testParseGlobalMetadata() throws Exception {
    Parser parser = new NetCDFParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream stream = NetCDFParser.class.getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }

    // Expected value goes first so that a failure reports "expected ... but was ..." correctly
    assertEquals("model output prepared for IPCC AR4", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("ccsm@ucar.edu", metadata.get(Metadata.CONTACT));
    assertEquals("IPCC Fourth Assessment", metadata.get(Metadata.PROJECT_ID));
    assertEquals("CF-1.0", metadata.get(Metadata.CONVENTIONS));
    assertEquals("1", metadata.get(Metadata.REALIZATION));
    assertEquals("720 ppm stabilization experiment (SRESA1B)", metadata.get(Metadata.EXPERIMENT_ID));
    assertEquals("NetCDF-3/CDM", metadata.get("File-Type-Description"));

    String content = handler.toString();
    assertContains("long_name = \"Surface area\"", content);
    assertContains("float area(lat=128, lon=256)", content);
    assertContains("float lat(lat=128)", content);
    assertContains("double lat_bnds(lat=128, bnds=2)", content);
    assertContains("double lon_bnds(lon=256, bnds=2)", content);
}
Aggregations