use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class ExtractEmbeddedFiles method extract.
/**
 * Runs the configured parser over {@code is} with a custom
 * {@link EmbeddedDocumentExtractor} registered in the parse context.
 *
 * <p>The context carries both the parser itself (under {@code Parser.class}) and a
 * {@code MyEmbeddedDocumentExtractor} built from {@code outputDir} and that same
 * context. The extracted body text and metadata of the container document are
 * discarded; only what the extractor does with the embedded documents matters here.
 *
 * @param is        stream of the container document to parse
 * @param outputDir directory handed to the embedded document extractor
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   propagated from the parser
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    context.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(outputDir, context));

    // NOTE(review): -1 presumably lifts BodyContentHandler's write limit - confirm.
    ContentHandler bodyHandler = new BodyContentHandler(-1);
    parser.parse(is, bodyHandler, new Metadata(), context);
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class MockParser method handleEmbedded.
/**
 * Treats the text content of {@code action} as an embedded document and passes it
 * to the embedded document extractor obtained from {@code context}.
 *
 * <p>The optional {@code filename} and {@code content-type} attributes of the node
 * are copied into the embedded document's metadata; a missing attribute is treated
 * as the empty string, and an empty content type is not recorded at all.
 *
 * @param action  DOM node whose text content is the embedded document body
 * @param handler handler of the enclosing document; wrapped in an
 *                {@code EmbeddedContentHandler} before being handed to the extractor
 * @param context parse context used to look up the extractor
 */
private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context) throws TikaException, SAXException, IOException {
    NamedNodeMap attributes = action.getAttributes();

    Node nameAttr = (attributes == null) ? null : attributes.getNamedItem("filename");
    String fileName = (nameAttr == null) ? "" : nameAttr.getNodeValue();

    Node typeAttr = (attributes == null) ? null : attributes.getNamedItem("content-type");
    String contentType = (typeAttr == null) ? "" : typeAttr.getNodeValue();

    String body = action.getTextContent();
    EmbeddedDocumentExtractor embeddedExtractor = getEmbeddedDocumentExtractor(context);

    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
    if (!"".equals(contentType)) {
        embeddedMetadata.set(Metadata.CONTENT_TYPE, contentType);
    }

    byte[] bodyBytes = body.getBytes(UTF_8);
    embeddedExtractor.parseEmbedded(
            new ByteArrayInputStream(bodyBytes),
            new EmbeddedContentHandler(handler),
            embeddedMetadata,
            true);
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class RarParser method parse.
/**
 * Parses a RAR archive: every non-directory entry is described via
 * {@code PackageParser.handleEntryMetadata} and, if the embedded document
 * extractor accepts it, handed to that extractor.
 *
 * <p>The input is obtained as a file through {@link TikaInputStream} (registered
 * with a {@link TemporaryResources} instance) because {@code Archive} is opened
 * from a file. The archive is closed <em>before</em> the temporary resources are
 * released, so no file handle is still open on the backing file at that point.
 *
 * @param stream   the RAR data
 * @param handler  receives the XHTML events for the archive and its entries
 * @param metadata metadata of the archive document
 * @param context  parse context used to look up the embedded document extractor
 * @throws EncryptedDocumentException if the archive reports itself as encrypted
 * @throws TikaException              if the RAR library raises a {@code RarException}
 * @throws IOException                on read failures or when closing the archive fails
 * @throws SAXException               propagated from the content handler
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        Archive rar = new Archive(tis.getFile());
        // Inner try/finally: resources of a try-with-resources statement are closed
        // before its own finally clause runs, so closing the archive in the outer
        // finally would happen only after tmp had already been released.
        try {
            if (rar.isEncrypted()) {
                throw new EncryptedDocumentException();
            }
            //Without this BodyContentHandler does not work
            xhtml.element("div", " ");
            FileHeader header = rar.nextFileHeader();
            // Stop early if the parsing thread has been interrupted.
            while (header != null && !Thread.currentThread().isInterrupted()) {
                if (!header.isDirectory()) {
                    try (InputStream subFile = rar.getInputStream(header)) {
                        // Fall back to the plain file name when the wide-char name is empty.
                        String entryName = "".equals(header.getFileNameW()) ? header.getFileNameString() : header.getFileNameW();
                        Metadata entrydata = PackageParser.handleEntryMetadata(entryName, header.getCTime(), header.getMTime(), header.getFullUnpackSize(), xhtml);
                        if (extractor.shouldParseEmbedded(entrydata)) {
                            extractor.parseEmbedded(subFile, handler, entrydata, true);
                        }
                    }
                }
                header = rar.nextFileHeader();
            }
        } finally {
            rar.close();
        }
    } catch (RarException e) {
        throw new TikaException("RarParser Exception", e);
    }
    xhtml.endDocument();
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class RFC822ParserTest method testExtractAttachments.
/**
 * Parses an e-mail with a PNG attachment twice with {@code RFC822Parser}: first
 * with an empty parse context, then with a recording
 * {@link EmbeddedDocumentExtractor} registered, and verifies both the message
 * metadata and the embedded parts the extractor was offered.
 */
@Test
public void testExtractAttachments() throws Exception {
    final String emailResource = "test-documents/testEmailWithPNGAtt.eml";

    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata mailMetadata = new Metadata();
    Parser mailParser = new RFC822Parser();
    ParseContext parseContext = new ParseContext();

    // First pass: nothing has been registered in the parse context.
    try (InputStream in = getStream(emailResource)) {
        mailParser.parse(in, bodyHandler, mailMetadata, parseContext);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", mailMetadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", mailMetadata.get(TikaCoreProperties.TITLE));

    // Second pass: record the detected type and extracted text of every embedded part.
    final Detector typeDetector = new DefaultDetector();
    final Parser partParser = new AutoDetectParser();
    final List<MediaType> detectedTypes = new ArrayList<>();
    final List<String> extractedTexts = new ArrayList<>();

    EmbeddedDocumentExtractor recordingExtractor = new EmbeddedDocumentExtractor() {
        @Override
        public boolean shouldParseEmbedded(Metadata partMetadata) {
            // Accept every embedded part.
            return true;
        }

        @Override
        public void parseEmbedded(InputStream partStream, ContentHandler ignoredHandler, Metadata partMetadata, boolean outputHtml) throws SAXException, IOException {
            MediaType detected = typeDetector.detect(partStream, partMetadata);
            detectedTypes.add(detected);

            // Each part is parsed into its own handler with a fresh context.
            ContentHandler partHandler = new BodyContentHandler();
            try {
                partParser.parse(partStream, partHandler, partMetadata, new ParseContext());
            } catch (TikaException e) {
                throw new RuntimeException(e);
            }
            extractedTexts.add(partHandler.toString());
        }
    };
    parseContext.set(EmbeddedDocumentExtractor.class, recordingExtractor);

    try (InputStream in = getStream(emailResource)) {
        mailParser.parse(in, bodyHandler, mailMetadata, parseContext);
    }

    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", mailMetadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", mailMetadata.get(TikaCoreProperties.TITLE));

    // Check attachments
    assertEquals(2, detectedTypes.size());
    assertEquals(2, extractedTexts.size());
    assertEquals("text/plain", detectedTypes.get(0).toString());
    assertEquals("image/png", detectedTypes.get(1).toString());
    assertEquals("This email has a PNG attachment included in it\n\n", extractedTexts.get(0));
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class MboxParser method parse.
/**
 * Splits an mbox stream into individual messages and offers each one to the
 * embedded document extractor as a {@code message/rfc822} document.
 *
 * <p>The stream is decoded as windows-1252. A line starting with
 * {@code MBOX_RECORD_DIVIDER} opens a new message; the following lines, up to the
 * next divider, the end of the stream, or until {@code MAIL_MAX_SIZE} bytes have
 * been buffered, form the message. Lines beginning with a space or tab are
 * treated as continuations and appended to the line directly before them, and
 * every resulting line is passed to {@code saveHeaderInMetadata}. Lines outside a
 * message are skipped. An empty stream yields an empty document.
 *
 * @param stream   the mbox data
 * @param handler  receives the XHTML events
 * @param metadata metadata of the mbox document; content type and encoding are set here
 * @param context  parse context used to look up the embedded document extractor
 * @throws IOException   on read failures
 * @throws TikaException propagated from the embedded document extractor
 * @throws SAXException  propagated from the content handler
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    String charsetName = "windows-1252";
    metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
    metadata.set(Metadata.CONTENT_ENCODING, charsetName);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    InputStreamReader isr = new InputStreamReader(stream, charsetName);
    try (BufferedReader reader = new BufferedReader(isr)) {
        String curLine = reader.readLine();
        int mailItem = 0;
        // A pre-checked loop: on an empty stream the very first readLine() returns
        // null, which must not be dereferenced.
        while (curLine != null && !Thread.currentThread().isInterrupted()) {
            if (!curLine.startsWith(MBOX_RECORD_DIVIDER)) {
                // Not inside a message: skip ahead to the next divider line.
                curLine = reader.readLine();
                continue;
            }
            Metadata mailMetadata = new Metadata();
            // Declared as LinkedList so the most recently added line can be taken
            // back off the tail when a continuation line follows it.
            LinkedList<String> multiline = new LinkedList<String>();
            mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
            mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
            curLine = reader.readLine();
            if (curLine == null) {
                // Divider was the last line of the stream: there is no message body.
                break;
            }
            ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
            do {
                boolean continuation = curLine.startsWith(" ") || curLine.startsWith("\t");
                // pollLast() (tail), not poll() (head): the continuation belongs to
                // the line read immediately before it.
                String previousLine = continuation ? multiline.pollLast() : null;
                if (previousLine != null) {
                    multiline.add(previousLine + " " + curLine.trim());
                } else {
                    // Regular line, or a continuation with nothing to attach to.
                    multiline.add(curLine);
                }
                message.write(curLine.getBytes(charsetName));
                message.write(0x0A);
                curLine = reader.readLine();
            } while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
            for (String item : multiline) {
                saveHeaderInMetadata(mailMetadata, item);
            }
            ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
            // Drop the buffer reference; only the copied byte array is needed from here on.
            message = null;
            if (extractor.shouldParseEmbedded(mailMetadata)) {
                extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
            }
            if (tracking) {
                getTrackingMetadata().put(mailItem++, mailMetadata);
            }
        }
    }
    xhtml.endDocument();
}
Aggregations