use of org.apache.tika.parser.ParserDecorator in project tika by apache.
the class TikaConfigTest method defaultParserWithExcludes.
/**
 * TIKA-1445 It should be possible to exclude DefaultParser from
 * certain types, so another parser explicitly listed will take them.
 * <p>
 * The test deliberately does not catch anything: an unexpected exception
 * propagates to the test runner, which reports it together with its
 * full stack trace instead of a flattened one-line message.
 *
 * @throws Exception on any unexpected failure while loading or
 *         inspecting the test configuration
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    TikaConfig config = getConfig("TIKA-1445-default-except.xml");
    CompositeParser cp = (CompositeParser) config.getParser();
    List<Parser> parsers = cp.getAllComponentParsers();
    Parser p;

    // Will be the three parsers defined in the xml
    assertEquals(3, parsers.size());

    // Should have a wrapped DefaultParser, not the main DefaultParser,
    // as it is excluded from handling certain classes
    p = parsers.get(0);
    assertTrue(p.toString(), p instanceof ParserDecorator);
    assertEquals(DefaultParser.class, ((ParserDecorator) p).getWrappedParser().getClass());

    // Should have two others which claim things, which they wouldn't
    // otherwise handle
    p = parsers.get(1);
    assertTrue(p.toString(), p instanceof ParserDecorator);
    assertEquals(EmptyParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
    assertEquals("hello/world", p.getSupportedTypes(null).iterator().next().toString());

    p = parsers.get(2);
    assertTrue(p.toString(), p instanceof ParserDecorator);
    assertEquals(ErrorParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
    assertEquals("fail/world", p.getSupportedTypes(null).iterator().next().toString());
}
use of org.apache.tika.parser.ParserDecorator in project tika by apache.
the class TikaParserConfigTest method testMimeExcludeInclude.
/**
 * Loads "TIKA-1558-blacklist.xml" and checks the resulting parser layout:
 * a plain CompositeParser holding exactly two decorated parsers. The first
 * wraps a DefaultParser and must no longer advertise PDF or JPEG even though
 * the wrapped parser still does; the second wraps an EmptyParser and must
 * advertise PDF as its only type.
 */
@Test
public void testMimeExcludeInclude() throws Exception {
    final MediaType pdf = MediaType.application("pdf");
    final MediaType jpeg = MediaType.image("jpeg");

    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    // The top-level parser is exactly a CompositeParser with two children
    Parser root = config.getParser();
    assertEquals(CompositeParser.class, root.getClass());
    CompositeParser composite = (CompositeParser) root;
    assertEquals(2, composite.getAllComponentParsers().size());

    // Each child has to be a decorator before it can be unwrapped
    Parser first = composite.getAllComponentParsers().get(0);
    Parser second = composite.getAllComponentParsers().get(1);
    assertTrue(first instanceof ParserDecorator);
    assertTrue(second instanceof ParserDecorator);
    ParserDecorator defaultWithExcludes = (ParserDecorator) first;
    ParserDecorator emptyForPdf = (ParserDecorator) second;

    // The decorated DefaultParser hides PDF and JPEG; the raw one keeps them
    assertEquals(DefaultParser.class, defaultWithExcludes.getWrappedParser().getClass());
    assertNotContained(pdf, defaultWithExcludes.getSupportedTypes(context));
    assertContains(pdf, defaultWithExcludes.getWrappedParser().getSupportedTypes(context));
    assertNotContained(jpeg, defaultWithExcludes.getSupportedTypes(context));
    assertContains(jpeg, defaultWithExcludes.getWrappedParser().getSupportedTypes(context));

    // The decorated EmptyParser claims PDF only; the raw one does not claim it
    assertEquals(EmptyParser.class, emptyForPdf.getWrappedParser().getClass());
    assertEquals(1, emptyForPdf.getSupportedTypes(context).size());
    assertContains(pdf, emptyForPdf.getSupportedTypes(context));
    assertNotContained(pdf, emptyForPdf.getWrappedParser().getSupportedTypes(context));
}
use of org.apache.tika.parser.ParserDecorator in project tika by apache.
the class TikaCLI method displayParser.
/**
 * Prints a parser, and recursively the children of a composite parser,
 * to standard output.
 * <p>
 * A {@link ParserDecorator} is unwrapped first and the wrapped parser is
 * what gets reported. In plain mode the decoration name is appended to the
 * class name; in apt mode the class name is turned into an api link instead
 * and composite parsers get no line of their own.
 *
 * @param p                parser to print
 * @param includeMimeTypes whether to list the supported media types of
 *                         non-composite parsers
 * @param apt              whether to emit apt markup rather than plain text
 * @param i                indentation level passed to {@code indent}
 */
private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) {
    // Report on the wrapped parser, remembering which decoration surrounded it.
    Parser target = p;
    String decorationSuffix = null;
    if (target instanceof ParserDecorator) {
        ParserDecorator decorator = (ParserDecorator) target;
        decorationSuffix = " (Wrapped by " + decorator.getDecorationName() + ")";
        target = decorator.getWrappedParser();
    }

    final boolean composite = target instanceof CompositeParser;
    final String bullet = apt ? "* " : "";

    String label = target.getClass().getName();
    if (apt) {
        // Keep the package prefix as text and make the simple name a link.
        int afterLastDot = label.lastIndexOf(".") + 1;
        String packagePrefix = label.substring(0, afterLastDot);
        String simpleName = label.substring(afterLastDot);
        label = packagePrefix + "{{{./api/" + label.replace(".", "/") + "}" + simpleName + "}}";
    } else if (decorationSuffix != null) {
        label = label + decorationSuffix;
    }

    // Don't display Composite parsers in the apt output.
    if (!(apt && composite)) {
        String heading = indent(i) + bullet + label;
        if (composite) {
            heading = heading + " (Composite Parser):";
        }
        System.out.println(heading);
        if (apt) {
            System.out.println();
        }
        if (includeMimeTypes && !composite) {
            for (MediaType type : target.getSupportedTypes(context)) {
                System.out.println(indent(i + 3) + bullet + type);
                if (apt) {
                    System.out.println();
                }
            }
        }
    }

    if (composite) {
        // Don't indent for Composites in apt.
        int childIndent = apt ? i : i + 3;
        Parser[] children = sortParsers(invertMediaTypeMap(((CompositeParser) target).getParsers()));
        for (Parser child : children) {
            displayParser(child, includeMimeTypes, apt, childIndent);
        }
    }
}
use of org.apache.tika.parser.ParserDecorator in project tika by apache.
the class EmbeddedDocumentUtil method tryToFindExistingLeafParser.
/**
 * Looks for an already-configured parser of the requested class among the
 * parsers reachable from the {@link Parser} stored in the ParseContext.
 * <p>
 * The context parser itself is checked first, then (if it is a
 * {@link ParserDecorator}) the parser it wraps, and finally, if that parser
 * is a {@link CompositeParser}, the composite is searched.
 * The use case is a parser that needs to parse an internal stream
 * that is _part_ of the document, e.g. rtf body inside an msg.
 *
 * @param clazz   parser class to search for
 * @param context parse context whose registered {@link Parser} is the
 *                starting point of the search
 * @return the matching parser, or <code>null</code> if the context
 *         contains no parser or no matching parser can be found
 */
public static Parser tryToFindExistingLeafParser(Class clazz, ParseContext context) {
    Parser candidate = context.get(Parser.class);
    if (equals(candidate, clazz)) {
        return candidate;
    }
    if (candidate == null) {
        return null;
    }

    // Look through one level of decoration.
    if (candidate instanceof ParserDecorator) {
        candidate = ((ParserDecorator) candidate).getWrappedParser();
    }
    if (equals(candidate, clazz)) {
        return candidate;
    }

    // Only a composite has further parsers to search.
    if (!(candidate instanceof CompositeParser)) {
        return null;
    }
    Parser found = findInComposite((CompositeParser) candidate, clazz, context);
    return (found != null && equals(found, clazz)) ? found : null;
}
use of org.apache.tika.parser.ParserDecorator in project tika by apache.
the class TikaConfigSerializer method addParser.
/**
 * Adds the given parser, and recursively the component parsers of a
 * composite, beneath {@code rootElement}.
 * <p>
 * A decorator whose class is nested inside {@link ParserDecorator} is
 * unwrapped and passed along as the decoration of the parser it wraps.
 * In {@code Mode.CURRENT} a DefaultParser is output without its children.
 * A bare CompositeParser, and a DefaultParser in {@code Mode.STATIC} or
 * {@code Mode.STATIC_FULL}, are not output themselves; their children are
 * added directly to {@code rootElement}.
 *
 * @param mode        serialization mode
 * @param rootElement element to add to
 * @param doc         document passed on to the element-creating overload
 * @param parser      parser to serialize
 * @throws Exception  if thrown by the element-creating overload or by a
 *                    recursive call
 */
private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
    // If the parser is decorated, is it a kind where we output the parser inside?
    Parser target = parser;
    ParserDecorator decoration = null;
    boolean nestedDecorator = target instanceof ParserDecorator
            && target.getClass().getName().startsWith(ParserDecorator.class.getName() + "$");
    if (nestedDecorator) {
        decoration = (ParserDecorator) target;
        target = decoration.getWrappedParser();
    }

    // In CURRENT mode a DefaultParser is output on its own, without children.
    boolean currentDefault = mode == Mode.CURRENT && target instanceof DefaultParser;

    List<Parser> children = Collections.emptyList();
    boolean outputParser = true;
    if (!currentDefault && target instanceof CompositeParser) {
        children = ((CompositeParser) target).getAllComponentParsers();
        // Special case for a naked composite
        boolean nakedComposite = target.getClass().equals(CompositeParser.class);
        // Special case for making Default to static
        boolean staticDefault = target instanceof DefaultParser
                && (mode == Mode.STATIC || mode == Mode.STATIC_FULL);
        outputParser = !(nakedComposite || staticDefault);
    }

    // Children hang off the new element when one is created, else off the given root.
    Element parent = rootElement;
    if (outputParser) {
        parent = addParser(mode, rootElement, doc, target, decoration);
    }
    for (Parser child : children) {
        addParser(mode, parent, doc, child);
    }
    // TODO Parser Exclusions
}
Aggregations