use of org.apache.tika.parser.CompositeParser in project tika by apache.
the class TikaCLI method displayParser.
/**
 * Prints a description of one parser to standard output and, when the parser
 * is a CompositeParser, recurses into each of its sub-parsers.
 * <p>
 * A ParserDecorator is unwrapped first. The " (Wrapped by ...)" note is only
 * appended to the name in non-apt output. In apt output the class name is
 * rewritten into {{{./api/...}SimpleName}} link markup, every printed line is
 * prefixed with "* " and followed by an empty line, and composite parsers get
 * no line of their own.
 *
 * @param p                the parser to describe
 * @param includeMimeTypes whether to list the supported media types beneath
 *                         each non-composite parser
 * @param apt              whether to produce apt output
 * @param i                the indentation level handed to indent()
 */
private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) {
    String wrapperNote = null;
    if (p instanceof ParserDecorator) {
        ParserDecorator decorator = (ParserDecorator) p;
        wrapperNote = " (Wrapped by " + decorator.getDecorationName() + ")";
        p = decorator.getWrappedParser();
    }

    final boolean composite = p instanceof CompositeParser;
    final String className = p.getClass().getName();
    final String bullet = apt ? "* " : "";

    String label;
    if (apt) {
        // Split at the last dot: package prefix stays as text, simple name becomes the link text.
        int simpleNameStart = className.lastIndexOf(".") + 1;
        label = className.substring(0, simpleNameStart)
                + "{{{./api/" + className.replace(".", "/") + "}"
                + className.substring(simpleNameStart) + "}}";
    } else if (wrapperNote != null) {
        label = className + wrapperNote;
    } else {
        label = className;
    }

    // Composite parsers are not displayed in the apt output.
    if (!apt || !composite) {
        String suffix = composite ? " (Composite Parser):" : "";
        System.out.println(indent(i) + bullet + label + suffix);
        if (apt) {
            System.out.println();
        }
        if (includeMimeTypes && !composite) {
            for (MediaType type : p.getSupportedTypes(context)) {
                System.out.println(indent(i + 3) + bullet + type);
                if (apt) {
                    System.out.println();
                }
            }
        }
    }

    if (!composite) {
        return;
    }

    // Children of a composite are not indented further in apt output.
    int childIndent = apt ? i : i + 3;
    Parser[] children = sortParsers(invertMediaTypeMap(((CompositeParser) p).getParsers()));
    for (Parser child : children) {
        displayParser(child, includeMimeTypes, apt, childIndent);
    }
}
use of org.apache.tika.parser.CompositeParser in project tika by apache.
the class BundleIT method testBundleParsers.
/**
 * Checks that the parser class names reachable through the OSGi Parser
 * service equal the class names reachable through defaultParser.
 */
@Test
public void testBundleParsers() throws Exception {
    // Collect the parser class names found within OSGi
    ServiceReference<Parser> reference = bc.getServiceReference(Parser.class);
    DefaultParser osgiService = (DefaultParser) bc.getService(reference);
    Set<String> osgiNames = new HashSet<>();
    for (Parser component : osgiService.getAllComponentParsers()) {
        osgiNames.add(component.getClass().getName());
    }

    // Sanity check: make sure we actually found a reasonable number
    int found = osgiNames.size();
    assertTrue("Should have lots Parser names, found " + found, found > 15);

    // Collect the raw parser class names from the traditional service loading
    // mechanism, expanding any nested DefaultParser by one level
    CompositeParser composite = (CompositeParser) defaultParser;
    Set<String> rawNames = new HashSet<>();
    for (Parser component : composite.getAllComponentParsers()) {
        if (!(component instanceof DefaultParser)) {
            rawNames.add(component.getClass().getName());
            continue;
        }
        for (Parser nested : ((DefaultParser) component).getAllComponentParsers()) {
            rawNames.add(nested.getClass().getName());
        }
    }

    assertEquals(rawNames, osgiNames);
}
use of org.apache.tika.parser.CompositeParser in project tika by apache.
the class EmbeddedDocumentUtil method tryToFindExistingLeafParser.
/**
 * Tries to find an existing parser of the requested class within the
 * ParseContext. The parser registered in the context is checked directly,
 * then after unwrapping a single ParserDecorator, and finally, if it is a
 * CompositeParser, by searching inside it.
 * <p>
 * The use case is when a parser needs to parse an internal stream that is
 * _part_ of the document, e.g. rtf body inside an msg.
 *
 * @param clazz   parser class to search for
 * @param context parse context whose registered Parser is inspected
 * @return the matching parser, or <code>null</code> if the context contains
 *         no parser or the correct parser can't be found
 */
public static Parser tryToFindExistingLeafParser(Class clazz, ParseContext context) {
    Parser candidate = context.get(Parser.class);
    if (equals(candidate, clazz)) {
        return candidate;
    }
    if (candidate == null) {
        return null;
    }

    // Look through one level of decoration before comparing again.
    if (candidate instanceof ParserDecorator) {
        candidate = ((ParserDecorator) candidate).getWrappedParser();
    }
    if (equals(candidate, clazz)) {
        return candidate;
    }

    if (!(candidate instanceof CompositeParser)) {
        return null;
    }
    Parser nested = findInComposite((CompositeParser) candidate, clazz, context);
    return (nested != null && equals(nested, clazz)) ? nested : null;
}
use of org.apache.tika.parser.CompositeParser in project tika by apache.
the class DumpTikaConfigExampleTest method testDump.
/**
 * For each of UTF-8 and UTF-16LE, and for every TikaConfigSerializer.Mode,
 * serializes the default TikaConfig to configFile, loads it back and checks
 * that the resulting parser and detector are composites of a plausible size
 * and that an AutoDetectParser can be built from the reloaded config.
 *
 * @throws Exception if serialization, reloading or any assertion fails
 */
@Test
public void testDump() throws Exception {
    // NOTE(review): ex is never used below; kept so the example class is still instantiated.
    DumpTikaConfigExample ex = new DumpTikaConfigExample();
    for (Charset charset : new Charset[] { UTF_8, UTF_16LE }) {
        for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
            // try-with-resources flushes and closes the writer even when serialize() throws,
            // so the handle on configFile is never leaked across iterations.
            try (Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset)) {
                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
            }

            TikaConfig c = new TikaConfig(configFile);
            assertTrue(c.getParser().toString(), c.getParser() instanceof CompositeParser);
            assertTrue(c.getDetector().toString(), c.getDetector() instanceof CompositeDetector);

            CompositeParser p = (CompositeParser) c.getParser();
            assertTrue("enough parsers?", p.getParsers().size() > 130);

            CompositeDetector d = (CompositeDetector) c.getDetector();
            assertTrue("enough detectors?", d.getDetectors().size() > 3);

            // just try to load it into autodetect to make sure no errors are thrown
            Parser auto = new AutoDetectParser(c);
            assertNotNull(auto);
        }
    }
}
use of org.apache.tika.parser.CompositeParser in project tika by apache.
the class TikaConfigSerializer method addParser.
/**
 * Adds the given parser and, where applicable, its component parsers to the
 * document being built.
 * <p>
 * A decorator whose class name starts with ParserDecorator's class name plus
 * "$" is unwrapped, and the decoration is passed along when the wrapped
 * parser is output. Component parsers are only descended into for a
 * CompositeParser, and not for a DefaultParser in Mode.CURRENT. The parser
 * itself is skipped when it is exactly CompositeParser, or a DefaultParser
 * in Mode.STATIC / Mode.STATIC_FULL; its children are then added directly
 * under rootElement.
 *
 * @param mode        serialization mode
 * @param rootElement element under which the parser is added
 * @param doc         document passed through to the element-creating overload
 * @param parser      parser to add
 * @throws Exception  propagated from the overload or from recursive calls
 */
private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
    // If the parser is decorated, is it a kind where we output the parser inside?
    ParserDecorator decoration = null;
    boolean unwrappable = parser instanceof ParserDecorator
            && parser.getClass().getName().startsWith(ParserDecorator.class.getName() + "$");
    if (unwrappable) {
        decoration = (ParserDecorator) parser;
        parser = decoration.getWrappedParser();
    }

    final boolean isDefault = parser instanceof DefaultParser;
    // In CURRENT mode a DefaultParser is output on its own, without its children.
    final boolean descend = parser instanceof CompositeParser
            && !(mode == Mode.CURRENT && isDefault);

    List<Parser> children = Collections.emptyList();
    boolean outputParser = true;
    if (descend) {
        children = ((CompositeParser) parser).getAllComponentParsers();
        // Special case for a naked composite
        boolean nakedComposite = parser.getClass().equals(CompositeParser.class);
        // Special case for making Default to static
        boolean staticDefault = isDefault && (mode == Mode.STATIC || mode == Mode.STATIC_FULL);
        outputParser = !nakedComposite && !staticDefault;
    }

    // Children hang off the parser's own element when it is output, else off the incoming root.
    Element parent = rootElement;
    if (outputParser) {
        parent = addParser(mode, rootElement, doc, parser, decoration);
    }
    for (Parser child : children) {
        addParser(mode, parent, doc, child);
    }
    // TODO Parser Exclusions
}
Aggregations