use of org.xml.sax.ContentHandler in project tika by apache.
the class ForkParserIntegrationTest method testAttachingADebuggerOnTheForkedParserShouldWork.
/**
 * TIKA-832
 * <p>
 * Configures the forked parser's java command with JDWP debugging options and
 * checks that the expected text is still extracted from the test document.
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
    // The stream is opened as a resource so it is closed even when parsing or an
    // assertion fails; the finally clause still closes the fork parser afterwards.
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler body = new BodyContentHandler();
        parser.parse(stream, body, new Metadata(), context);
        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    } finally {
        parser.close();
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class TikaResource method createParser.
/**
 * Creates the parser used by this resource.
 * <p>
 * An {@link AutoDetectParser} is built from {@code tikaConfig}, its parser map is
 * extended so that {@code MediaType.APPLICATION_XML} is handled by an
 * {@link HtmlParser}, and a fallback is installed that rejects the request with
 * {@code UNSUPPORTED_MEDIA_TYPE}. When {@code digester} is set, the result is
 * wrapped in a {@link DigestingParser}.
 *
 * @return the configured parser, wrapped in a digesting parser if a digester is set
 */
@SuppressWarnings("serial")
public static Parser createParser() {
    final AutoDetectParser autoDetect = new AutoDetectParser(tikaConfig);

    Map<MediaType, Parser> parsersByType = autoDetect.getParsers();
    parsersByType.put(MediaType.APPLICATION_XML, new HtmlParser());
    autoDetect.setParsers(parsersByType);

    autoDetect.setFallback(new Parser() {
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            // Report the same types as the enclosing auto-detect parser.
            return autoDetect.getSupportedTypes(parseContext);
        }

        @Override
        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            // The fallback never parses; it always fails the request.
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    });

    if (digester == null) {
        return autoDetect;
    }
    return new DigestingParser(autoDetect, digester);
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class TikaResource method produceOutput.
/**
 * Prepares a parser, metadata and parse context from the request and returns a
 * {@link StreamingOutput} that parses {@code is} when written, serializing the
 * SAX events to the response as UTF-8.
 *
 * @param is          the document stream to parse
 * @param httpHeaders request headers used to fill the metadata and parse context
 * @param info        request URI information; its path is passed on to the parse call
 * @param format      value set as the transformer's {@code OutputKeys.METHOD}
 * @return a streaming output that performs the parse when written
 */
private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, final UriInfo info, final String format) {
    final Parser parser = createParser();
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();
    fillMetadata(parser, metadata, context, httpHeaders);
    fillParseContext(context, httpHeaders, parser);
    logRequest(LOG, info, metadata);

    return new StreamingOutput() {
        @Override
        public void write(OutputStream outputStream) throws IOException, WebApplicationException {
            Writer out = new OutputStreamWriter(outputStream, UTF_8);
            SAXTransformerFactory saxFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();

            // Only creating the handler can raise the checked configuration exception.
            final TransformerHandler serializer;
            try {
                serializer = saxFactory.newTransformerHandler();
            } catch (TransformerConfigurationException e) {
                throw new WebApplicationException(e);
            }

            serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
            serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name());
            serializer.setResult(new StreamResult(out));

            ContentHandler content = new ExpandedTitleContentHandler(serializer);
            parse(parser, LOG, info.getPath(), is, content, metadata, context);
        }
    };
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class UnpackerResource method process.
/**
 * Parses the stream with a {@code MyEmbeddedDocumentExtractor} registered in the
 * parse context and returns the resulting name-to-bytes map.
 * <p>
 * When {@code saveAll} is true, the body text written during the parse and a CSV
 * rendering of the metadata are added under {@code TEXT_FILENAME} and
 * {@code META_FILENAME}. When it is false and the extractor's counter is still
 * zero after parsing, the request fails with {@code NO_CONTENT}.
 *
 * @param is          the document stream to parse
 * @param httpHeaders request headers used to fill the metadata
 * @param info        request URI information; its path is passed on to the parse call
 * @param saveAll     whether to also store the text and the metadata of the document
 * @return map from file name to file content
 * @throws Exception if parsing or writing the metadata fails
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    Metadata metadata = new Metadata();
    ParseContext pc = new ParseContext();

    Parser parser = TikaResource.createParser();
    if (parser instanceof DigestingParser) {
        // no need to digest for unwrapping
        parser = ((DigestingParser) parser).getWrappedParser();
    }
    TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    // Text is only captured when everything is to be saved; otherwise SAX events are ignored.
    ByteArrayOutputStream textBytes = new ByteArrayOutputStream();
    ContentHandler handler = saveAll
            ? new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(textBytes, UTF_8)))
            : new DefaultHandler();

    Map<String, byte[]> extracted = new HashMap<>();
    MutableInt embeddedCount = new MutableInt();
    pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(embeddedCount, extracted));
    TikaResource.parse(parser, LOG, info.getPath(), is, handler, metadata, pc);

    if (!saveAll) {
        if (embeddedCount.intValue() == 0) {
            throw new WebApplicationException(Response.Status.NO_CONTENT);
        }
        return extracted;
    }

    extracted.put(TEXT_FILENAME, textBytes.toByteArray());
    ByteArrayOutputStream metadataBytes = new ByteArrayOutputStream();
    metadataToCsv(metadata, metadataBytes);
    extracted.put(META_FILENAME, metadataBytes.toByteArray());
    return extracted;
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class SXSLFExtractorTest method testPowerPoint.
/**
 * Parses each PowerPoint variant (presentation, macro-enabled, slideshow,
 * template) and checks the detected mime type, the title/creator/author
 * metadata and the extracted body text.
 */
@Test
public void testPowerPoint() throws Exception {
    String[] extensions = { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    // Expected content types, index-aligned with the extensions above.
    String[] mimeTypes = {
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.ms-powerpoint.presentation.macroenabled.12",
        "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.ms-powerpoint.template.macroenabled.12"
    };
    // Phrases that every non-theme file is expected to contain.
    String[] expectedPhrases = {
        "Attachment Test",
        "This is a test file data with the same content",
        "content parsing",
        "Different words to test against",
        "Mystery"
    };

    for (int idx = 0; idx < extensions.length; idx++) {
        String extension = extensions[idx];
        String filename = "testPPT." + extension;
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();

        try (InputStream input = getResourceAsStream("/test-documents/" + filename)) {
            parser.parse(input, handler, metadata, parseContext);

            assertEquals("Mime-type checking for " + filename, mimeTypes[idx], metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));

            String content = handler.toString();
            // Theme files don't have the text in them
            // ("thmx" is not in the extension list above, so this branch is not exercised here).
            if (extension.equals("thmx")) {
                assertEquals("", content);
                continue;
            }
            for (String phrase : expectedPhrases) {
                assertTrue("Text missing for " + filename + "\n" + content, content.contains(phrase));
            }
        }
    }
}
Aggregations