use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ParserDecoratorTest method withFallback.
/**
 * Exercises one proposed implementation for TIKA-1509: a decorator that
 * chains several parsers and advertises an explicitly supplied set of
 * media types rather than the types of the wrapped parsers.
 */
@Test
public void withFallback() throws Exception {
    ParseContext context = new ParseContext();
    byte[] sample = new byte[] { 0, 1, 2, 3, 4 };

    Set<MediaType> octetOnly = Collections.singleton(MediaType.OCTET_STREAM);
    Set<MediaType> octetAndText = new HashSet<MediaType>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));

    ErrorParser failing = new ErrorParser();
    DummyParser working = new DummyParser(octetOnly, new HashMap<String, String>(), "Fell back!");
    EmptyParser silent = new EmptyParser();

    // Chain whose first member fails, followed by one that produces output
    @SuppressWarnings("deprecation") Parser failThenWork = ParserDecorator.withFallbacks(Arrays.asList(failing, working), octetAndText);

    // The decorator reports the types it was given, not those of its children
    Set<MediaType> supported = failThenWork.getSupportedTypes(context);
    String supportedText = supported.toString();
    assertEquals(2, supported.size());
    assertEquals(supportedText, true, supported.contains(MediaType.TEXT_PLAIN));
    assertEquals(supported.toString(), true, supported.contains(MediaType.OCTET_STREAM));

    // The first parser fails, so the output must come from the second one
    Metadata firstMetadata = new Metadata();
    BodyContentHandler firstHandler = new BodyContentHandler();
    failThenWork.parse(new ByteArrayInputStream(sample), firstHandler, firstMetadata, context);
    assertEquals("Fell back!", firstHandler.toString());

    // A first parser that succeeds without emitting anything yields no text
    Parser silentThenWork = ParserDecorator.withFallbacks(Arrays.asList(silent, working), octetAndText);
    Metadata secondMetadata = new Metadata();
    BodyContentHandler secondHandler = new BodyContentHandler();
    silentThenWork.parse(new ByteArrayInputStream(sample), secondHandler, secondMetadata, context);
    assertEquals("", secondHandler.toString());
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ForkParserTest method testParallelParsing.
/**
 * Runs ten concurrent parse calls against a single ForkParser and checks
 * that every call delivers the expected text to its own handler.
 */
@Test
public void testParallelParsing() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        final ParseContext context = new ParseContext();
        Thread[] workers = new Thread[10];
        ContentHandler[] results = new ContentHandler[workers.length];

        for (int idx = 0; idx < workers.length; idx++) {
            // One handler per worker so the outputs can be checked individually
            final ContentHandler sink = new BodyContentHandler();
            results[idx] = sink;
            Thread worker = new Thread(new Runnable() {
                @Override
                public void run() {
                    try {
                        InputStream empty = new ByteArrayInputStream(new byte[0]);
                        parser.parse(empty, sink, new Metadata(), context);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            });
            workers[idx] = worker;
            worker.start();
        }

        // Wait for each worker in turn and verify what it produced
        for (int idx = 0; idx < workers.length; idx++) {
            workers[idx].join();
            String text = results[idx].toString().trim();
            assertEquals("Hello, World!", text);
        }
    } finally {
        parser.close();
    }
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class ForkParserTest method testPoolSizeReached.
/**
 * Occupies one parse call per pool slot (parser.getPoolSize() threads, each
 * reading from a pipe that stays open), then starts one additional parse call
 * and checks that it has produced no output after a one second wait. Once the
 * pipes are closed and all threads have been joined, the additional call is
 * expected to have produced "Hello, World!".
 */
@Test
public void testPoolSizeReached() throws Exception {
final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
try {
// Starts with zero permits; permits are released by the input streams and
// by the extra thread below, and acquired by this (the test) thread.
final Semaphore barrier = new Semaphore(0);
Thread[] threads = new Thread[parser.getPoolSize()];
PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
final ParseContext context = new ParseContext();
for (int i = 0; i < threads.length; i++) {
// Input whose single-byte read() signals the barrier before delegating.
// NOTE(review): a permit is released on every read() call, not just the
// first one — confirm that extra permits cannot satisfy the acquires below early.
final PipedInputStream input = new PipedInputStream() {
@Override
public synchronized int read() throws IOException {
barrier.release();
return super.read();
}
};
// Connected writer end; nothing is written to it, it is only closed later.
pipes[i] = new PipedOutputStream(input);
threads[i] = new Thread() {
public void run() {
try {
ContentHandler o = new DefaultHandler();
parser.parse(input, o, new Metadata(), context);
} catch (Exception e) {
// Only reported; the final assertion is what fails the test.
e.printStackTrace();
}
}
};
threads[i].start();
}
// Wait until all the background parsers have been started
// (one permit per pool slot, released from the overridden read()).
barrier.acquire(parser.getPoolSize());
final ContentHandler o = new BodyContentHandler();
// Extra parse call beyond the pool size; presumably it has to wait for a
// free slot inside ForkParser — the pooling itself is not visible here.
Thread blocked = new Thread() {
public void run() {
try {
// Signal that this thread is running before it calls parse().
barrier.release();
InputStream stream = new ByteArrayInputStream(new byte[0]);
parser.parse(stream, o, new Metadata(), context);
} catch (Exception e) {
e.printStackTrace();
}
}
};
blocked.start();
// Wait until the last thread is started, and then some to
// make sure that it would have had a chance to start processing
// data had it not been blocked.
barrier.acquire();
Thread.sleep(1000);
// No output yet: the extra call must not have parsed anything so far.
assertEquals("", o.toString());
// Closing each pipe ends the corresponding background input, then wait
// for that background thread to finish.
for (int i = 0; i < threads.length; i++) {
pipes[i].close();
threads[i].join();
}
// The extra call should now be able to complete and deliver its output.
blocked.join();
assertEquals("Hello, World!", o.toString().trim());
} finally {
parser.close();
}
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class Tika method parseToString.
/**
 * Extracts the text content of the given document and returns it as a
 * string, with the maximum string length chosen per call.
 * <p>
 * To keep memory use bounded, the returned string holds at most the first
 * {@code maxLength} characters extracted from the document. Hitting that
 * limit is not treated as an error; the text collected so far is returned.
 * <p>
 * <strong>NOTE:</strong> This method always closes the given stream before
 * returning, whether parsing succeeds or fails. Most other Tika methods
 * that accept an {@link InputStream} leave closing to the caller.
 *
 * @param stream the document to be parsed; closed by this method
 * @param metadata document metadata
 * @param maxLength maximum length of the returned string
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
 */
public String parseToString(InputStream stream, Metadata metadata, int maxLength) throws IOException, TikaException {
    WriteOutContentHandler textSink = new WriteOutContentHandler(maxLength);
    try {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);
        BodyContentHandler bodyOnly = new BodyContentHandler(textSink);
        parser.parse(stream, bodyOnly, metadata, parseContext);
    } catch (SAXException e) {
        boolean limitReached = textSink.isWriteLimitReached(e);
        if (!limitReached) {
            // This should never happen with BodyContentHandler...
            throw new TikaException("Unexpected SAX processing failure", e);
        }
        // Otherwise the write limit was hit: fall through and return what was collected
    } finally {
        stream.close();
    }
    return textSink.toString();
}
use of org.apache.tika.sax.BodyContentHandler in project tika by apache.
the class BundleIT method testForkParser.
/**
 * Parses a small HTML document through a {@link ForkParser} built on the
 * bundle's default parser and checks that the body text is extracted.
 * The media type is detected first and recorded in the metadata that is
 * handed to the parser.
 */
@Test
public void testForkParser() throws Exception {
    ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser);
    try {
        String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        Writer writer = new StringWriter();
        ContentHandler contentHandler = new BodyContentHandler(writer);
        Metadata metadata = new Metadata();

        MediaType type = contentTypeDetector.detect(stream, metadata);
        // Expected value first, so a failure message reads the right way round
        assertEquals("text/html", type.toString());
        metadata.add(Metadata.CONTENT_TYPE, type.toString());

        ParseContext parseCtx = new ParseContext();
        parser.parse(stream, contentHandler, metadata, parseCtx);
        writer.flush();

        String content = writer.toString();
        assertTrue(content.length() > 0);
        assertEquals("test content", content.trim());
    } finally {
        // Close the parser even when an assertion fails, as the other
        // ForkParser tests do
        parser.close();
    }
}
Aggregations