use of org.xml.sax.ContentHandler in project tika by apache.
the class PowerPointParserTest method testMasterFooter.
/**
 * Parses {@code testPPT_masterFooter.ppt} with {@link OfficeParser} and checks
 * the body text: the master footer text must be present, while the
 * "Click to edit Master" placeholder text and any {@code *} character must be absent.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testMasterFooter() throws Exception {
    final Metadata meta = new Metadata();
    final ContentHandler bodyHandler = new BodyContentHandler();
    try (InputStream ppt = PowerPointParserTest.class.getResourceAsStream(
            "/test-documents/testPPT_masterFooter.ppt")) {
        new OfficeParser().parse(ppt, bodyHandler, meta, new ParseContext());
    }
    final String text = bodyHandler.toString();

    // The footer text must show up in the extracted body.
    assertContains("Master footer is here", text);

    // Boilerplate placeholder text must not come through.
    final int boilerplateAt = text.indexOf("Click to edit Master");
    assertEquals(-1, boilerplateAt);

    // TIKA-1171: the extracted text must not contain any '*' character.
    final int asteriskAt = text.indexOf("*");
    assertEquals(-1, asteriskAt);
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class PublisherParserTest method testPublisherParser.
/**
 * Parses {@code testPUBLISHER.pub} with {@link OfficeParser} and verifies the
 * reported content type, the absent title, the creator/author values and two
 * text fragments expected in the extracted body.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testPublisherParser() throws Exception {
    try (InputStream pub = PublisherParserTest.class.getResourceAsStream(
            "/test-documents/testPUBLISHER.pub")) {
        final Metadata meta = new Metadata();
        final ContentHandler bodyHandler = new BodyContentHandler();
        final OfficeParser officeParser = new OfficeParser();
        officeParser.parse(pub, bodyHandler, meta, new ParseContext());

        // Metadata checks.
        final String contentType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/x-mspublisher", contentType);
        final String title = meta.get(TikaCoreProperties.TITLE);
        assertEquals(null, title);
        final String creator = meta.get(TikaCoreProperties.CREATOR);
        assertEquals("Nick Burch", creator);
        final String author = meta.get(Metadata.AUTHOR);
        assertEquals("Nick Burch", author);

        // Body text checks.
        final String text = bodyHandler.toString();
        assertContains("0123456789", text);
        assertContains("abcdef", text);
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class ForkParserTest method testParallelParsing.
/**
 * Runs ten concurrent parses of an empty stream through one shared
 * {@link ForkParser} and checks that each parse yields "Hello, World!".
 *
 * <p>An exception thrown inside a worker thread is recorded and rethrown from
 * the test thread as the cause of an {@link AssertionError}, so the real
 * failure is reported instead of only a later text mismatch.
 *
 * @throws Exception if the test thread is interrupted while joining workers
 */
@Test
public void testParallelParsing() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        final ParseContext context = new ParseContext();
        Thread[] threads = new Thread[10];
        ContentHandler[] output = new ContentHandler[threads.length];
        // One slot per worker. Each worker writes only its own slot, and the
        // test thread reads a slot only after join() on that worker, which
        // makes the write visible without further synchronization.
        final Exception[] failures = new Exception[threads.length];
        for (int i = 0; i < threads.length; i++) {
            final int index = i;
            final ContentHandler o = new BodyContentHandler();
            output[i] = o;
            threads[i] = new Thread() {
                @Override
                public void run() {
                    try {
                        InputStream stream = new ByteArrayInputStream(new byte[0]);
                        parser.parse(stream, o, new Metadata(), context);
                    } catch (Exception e) {
                        // Hand the failure to the test thread instead of only printing it.
                        failures[index] = e;
                    }
                }
            };
            threads[i].start();
        }
        // Wait for every worker before asserting, so the parser is never
        // closed (in the finally block) while a worker is still using it.
        for (int i = 0; i < threads.length; i++) {
            threads[i].join();
        }
        for (int i = 0; i < threads.length; i++) {
            if (failures[i] != null) {
                throw new AssertionError("Parse failed in worker thread " + i, failures[i]);
            }
            assertEquals("Hello, World!", output[i].toString().trim());
        }
    } finally {
        parser.close();
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class ForkParserTest method testPoolSizeReached.
/**
 * Checks that a parse request made while {@code parser.getPoolSize()} other
 * parses are still in progress produces no output until those parses finish.
 *
 * <p>Each background parse reads from a {@link PipedInputStream} to which the
 * test never writes; the pipes are only closed near the end of the test.
 * NOTE(review): this presumably keeps every background parse blocked in
 * {@code read()} until its pipe is closed - confirm against ForkTestParser.
 */
@Test
public void testPoolSizeReached() throws Exception {
final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
try {
// Starts with zero permits; permits are added by the overridden read()
// below and by the "blocked" thread.
final Semaphore barrier = new Semaphore(0);
Thread[] threads = new Thread[parser.getPoolSize()];
PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
final ParseContext context = new ParseContext();
for (int i = 0; i < threads.length; i++) {
// Input stream that signals the barrier every time read() is invoked,
// before delegating to the normal piped read.
final PipedInputStream input = new PipedInputStream() {
@Override
public synchronized int read() throws IOException {
barrier.release();
return super.read();
}
};
// Keep the writing end so the test can close it later.
pipes[i] = new PipedOutputStream(input);
threads[i] = new Thread() {
public void run() {
try {
// Output of the background parses is not inspected.
ContentHandler o = new DefaultHandler();
parser.parse(input, o, new Metadata(), context);
} catch (Exception e) {
e.printStackTrace();
}
}
};
threads[i].start();
}
// Wait until all the background parsers have been started
// (i.e. until read() has released getPoolSize() permits in total).
barrier.acquire(parser.getPoolSize());
final ContentHandler o = new BodyContentHandler();
// One more parse, issued while the background parses are still pending.
Thread blocked = new Thread() {
public void run() {
try {
// Signal that this thread is running, just before it calls parse().
barrier.release();
InputStream stream = new ByteArrayInputStream(new byte[0]);
parser.parse(stream, o, new Metadata(), context);
} catch (Exception e) {
e.printStackTrace();
}
}
};
blocked.start();
// Wait until the last thread is started, and then some to
// make sure that it would have had a chance to start processing
// data had it not been blocked.
barrier.acquire();
Thread.sleep(1000);
// The extra parse must not have produced any text yet.
assertEquals("", o.toString());
// Close each pipe and wait for the corresponding background thread to finish.
for (int i = 0; i < threads.length; i++) {
pipes[i].close();
threads[i].join();
}
// Once the background parses are done, the extra parse is expected to complete.
blocked.join();
assertEquals("Hello, World!", o.toString().trim());
} finally {
parser.close();
}
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class BasicContentHandlerFactoryTest method testBody.
/**
 * Exercises {@link BasicContentHandlerFactory} with the {@code BODY} handler type
 * in four phases: in-memory handler without a write limit, in-memory handler
 * with a write limit of 5, output-stream handler without a write limit, and
 * output-stream handler with a write limit of 5.
 *
 * @throws Exception if parsing fails unexpectedly
 */
@Test
public void testBody() throws Exception {
    final BasicContentHandlerFactory.HANDLER_TYPE bodyType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;

    // Phase 1: in-memory handler, no write limit (-1).
    final Parser largeParser = new MockParser(OVER_DEFAULT);
    final BasicContentHandlerFactory unlimitedFactory = new BasicContentHandlerFactory(bodyType, -1);
    final ContentHandler unlimitedHandler = unlimitedFactory.getNewContentHandler();
    assertTrue(unlimitedHandler instanceof BodyContentHandler);
    largeParser.parse(null, unlimitedHandler, null, null);
    final String fullText = unlimitedHandler.toString();
    assertNotContains("title", fullText);
    assertContains("aaaaaaaaaa", fullText);
    assertTrue(fullText.length() > 110000);

    // Phase 2: in-memory handler with a write limit of 5.
    final Parser smallParser = new MockParser(10);
    final BasicContentHandlerFactory limitedFactory = new BasicContentHandlerFactory(bodyType, 5);
    final ContentHandler limitedHandler = limitedFactory.getNewContentHandler();
    assertTrue(limitedHandler instanceof BodyContentHandler);
    assertWriteLimitReached(smallParser, (BodyContentHandler) limitedHandler);
    final String limitedText = limitedHandler.toString();
    assertNotContains("This ", limitedText);
    assertContains("aaaa", limitedText);

    // Phase 3: handler writing to an output stream, no write limit.
    final Parser largeStreamParser = new MockParser(OVER_DEFAULT);
    final ByteArrayOutputStream unlimitedSink = new ByteArrayOutputStream();
    final ContentHandler unlimitedStreamHandler =
            new BasicContentHandlerFactory(bodyType, -1).getNewContentHandler(unlimitedSink, ENCODING);
    assertTrue(unlimitedStreamHandler instanceof BodyContentHandler);
    largeStreamParser.parse(null, unlimitedStreamHandler, null, null);
    final byte[] streamed = unlimitedSink.toByteArray();
    assertNotContains("title", streamed);
    assertContains("aaaaaaaaaa", streamed);
    assertNotContains("<body", streamed);
    assertNotContains("<html", streamed);
    assertTrue(streamed.length > 110000);

    // Phase 4: handler writing to an output stream with a write limit of 5;
    // the factory is expected to return a WriteOutContentHandler here and
    // nothing should reach the stream.
    final Parser smallStreamParser = new MockParser(10);
    final ByteArrayOutputStream limitedSink = new ByteArrayOutputStream();
    final ContentHandler limitedStreamHandler =
            new BasicContentHandlerFactory(bodyType, 5).getNewContentHandler(limitedSink, ENCODING);
    assertTrue(limitedStreamHandler instanceof WriteOutContentHandler);
    assertWriteLimitReached(smallStreamParser, (WriteOutContentHandler) limitedStreamHandler);
    final int bytesWritten = limitedSink.toByteArray().length;
    assertEquals(0, bytesWritten);
}
Aggregations