Use of org.apache.tika.parser.DigestingParser in the Apache Tika project.
Class TikaResource, method createParser.
/**
 * Creates the parser used to serve requests.
 *
 * <p>An {@link AutoDetectParser} is built from {@code tikaConfig}, with
 * {@link HtmlParser} registered for {@code MediaType.APPLICATION_XML}. Its
 * fallback parser rejects any parse attempt with an HTTP
 * {@code UNSUPPORTED_MEDIA_TYPE} error. When {@code digester} is non-null the
 * result is wrapped in a {@link DigestingParser}.
 *
 * @return the configured parser, wrapped in a {@link DigestingParser} when a
 *         digester is set
 */
@SuppressWarnings("serial")
public static Parser createParser() {
    final AutoDetectParser autoDetect = new AutoDetectParser(tikaConfig);

    // Register the HTML parser for application/xml in the parser table.
    Map<MediaType, Parser> byType = autoDetect.getParsers();
    byType.put(MediaType.APPLICATION_XML, new HtmlParser());
    autoDetect.setParsers(byType);

    // Fallback: report the same supported types as the auto-detect parser,
    // but refuse to parse anything that actually reaches it.
    autoDetect.setFallback(new Parser() {
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            return autoDetect.getSupportedTypes(parseContext);
        }

        @Override
        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    });

    if (digester == null) {
        return autoDetect;
    }
    return new DigestingParser(autoDetect, digester);
}
Use of org.apache.tika.parser.DigestingParser in the Apache Tika project.
Class UnpackerResource, method process.
/**
 * Parses the given stream and returns the files collected during the parse,
 * keyed by name.
 *
 * <p>A {@code MyEmbeddedDocumentExtractor} is registered on the parse context
 * with a counter and the result map. When {@code saveAll} is true, the
 * extracted text and a CSV rendering of the metadata are added under
 * {@code TEXT_FILENAME} and {@code META_FILENAME}. When {@code saveAll} is
 * false and the counter is still zero after parsing, the request is answered
 * with HTTP {@code NO_CONTENT}.
 *
 * @param is          the document stream to parse
 * @param httpHeaders request headers, used to populate the metadata
 * @param info        request URI information, used for logging and the parse path
 * @param saveAll     whether to also include the text and metadata of the document
 * @return map from file name to file bytes
 * @throws Exception if filling metadata, parsing or CSV conversion fails
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();

    // No need to digest for unwrapping: use the wrapped parser directly.
    final Parser created = TikaResource.createParser();
    final Parser parser = (created instanceof DigestingParser)
            ? ((DigestingParser) created).getWrappedParser()
            : created;

    TikaResource.fillMetadata(parser, metadata, context, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    // Text is only captured when everything is to be saved; otherwise SAX
    // events go to a no-op handler.
    final ByteArrayOutputStream textBytes = new ByteArrayOutputStream();
    final ContentHandler handler;
    if (!saveAll) {
        handler = new DefaultHandler();
    } else {
        handler = new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(textBytes, UTF_8)));
    }

    final Map<String, byte[]> extracted = new HashMap<>();
    final MutableInt embeddedCount = new MutableInt();
    context.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(embeddedCount, extracted));

    TikaResource.parse(parser, LOG, info.getPath(), is, handler, metadata, context);

    if (!saveAll) {
        if (embeddedCount.intValue() == 0) {
            throw new WebApplicationException(Response.Status.NO_CONTENT);
        }
        return extracted;
    }

    extracted.put(TEXT_FILENAME, textBytes.toByteArray());
    final ByteArrayOutputStream metaBytes = new ByteArrayOutputStream();
    metadataToCsv(metadata, metaBytes);
    extracted.put(META_FILENAME, metaBytes.toByteArray());
    return extracted;
}
Use of org.apache.tika.parser.DigestingParser in the Apache Tika project.
Class TikaCLI, method configure.
/**
 * Loads a Tika configuration from the given file and rebuilds the parser and
 * detector from it.
 *
 * <p>The new parser is an {@link AutoDetectParser} over the loaded config,
 * wrapped in a {@link DigestingParser} when {@code digester} is non-null. It
 * is also registered on {@code context} under {@code Parser.class}.
 *
 * @param configFilePath path of the Tika configuration file
 * @throws Exception if the configuration cannot be loaded
 */
private void configure(String configFilePath) throws Exception {
    this.configFilePath = configFilePath;
    config = new TikaConfig(new File(configFilePath));

    // Build the parser locally, then publish it to the field in one step.
    Parser configured = new AutoDetectParser(config);
    if (digester != null) {
        configured = new DigestingParser(configured, digester);
    }
    parser = configured;

    detector = config.getDetector();
    context.set(Parser.class, parser);
}
Use of org.apache.tika.parser.DigestingParser in the Apache Tika project.
Class DigestingAutoDetectParserFactory, method getParser.
/**
 * Returns an {@link AutoDetectParser} for the given config, wrapped in a
 * {@link DigestingParser} when this factory has a digester set.
 *
 * @param config the Tika configuration to build the parser from
 * @return the auto-detect parser, digest-wrapped if {@code digester} is non-null
 */
@Override
public Parser getParser(TikaConfig config) {
    final Parser autoDetect = new AutoDetectParser(config);
    return (digester == null) ? autoDetect : new DigestingParser(autoDetect, digester);
}
Use of org.apache.tika.parser.DigestingParser in the Apache Tika project.
Class TikaCLI, method process.
/**
 * Handles a single command-line argument.
 *
 * <p>Recognised options either run an informational action immediately
 * (help, version, listings, config dumps) or update this instance's settings
 * (output type, encoding, password, parser, and so on). Any argument that
 * matches no option is treated as input: a port number when server mode has
 * been selected, {@code "-"} for standard input, otherwise a file path or URL.
 *
 * <p>The branches are order-sensitive: prefix tests such as {@code -e},
 * {@code -p} and {@code -c} rely on the more specific long options having
 * been tested before or being excluded explicitly.
 *
 * @param arg the argument to handle
 * @throws Exception propagated from the invoked action (for example an
 *         invalid URI/URL, a non-numeric port, or a parse failure)
 */
public void process(String arg) throws Exception {
// --- Informational actions: each disables pipe mode and runs immediately ---
if (arg.equals("-?") || arg.equals("--help")) {
pipeMode = false;
usage();
} else if (arg.equals("-V") || arg.equals("--version")) {
pipeMode = false;
version();
} else if (arg.equals("-v") || arg.equals("--verbose")) {
// Verbose: raise the log4j root logger to DEBUG.
org.apache.log4j.Logger.getRootLogger().setLevel(Level.DEBUG);
} else if (arg.equals("-g") || arg.equals("--gui")) {
pipeMode = false;
// Pass the config file on to the GUI if one has already been set.
if (configFilePath != null) {
TikaGUI.main(new String[] { configFilePath });
} else {
TikaGUI.main(new String[0]);
}
} else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
pipeMode = false;
displayParsers(false, false);
} else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) {
pipeMode = false;
displayDetectors();
} else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) {
pipeMode = false;
displayParsers(true, false);
} else if (arg.equals("--list-parser-detail-apt") || arg.equals("--list-parser-details-apt")) {
pipeMode = false;
displayParsers(true, true);
} else if (arg.equals("--list-met-models")) {
pipeMode = false;
displayMetModels();
} else if (arg.equals("--list-supported-types")) {
pipeMode = false;
displaySupportedTypes();
} else if (arg.startsWith("--compare-file-magic=")) {
pipeMode = false;
// Everything after the first '=' is passed to the comparison.
compareFileMagic(arg.substring(arg.indexOf('=') + 1));
} else if (arg.equals("--dump-minimal-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
} else if (arg.equals("--dump-current-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.CURRENT);
} else if (arg.equals("--dump-static-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.STATIC);
} else if (arg.equals("--dump-static-full-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);
} else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) {
// ignore, as container-aware detectors are now always used
// --- Settings that affect how later inputs are processed ---
} else if (arg.equals("-f") || arg.equals("--fork")) {
fork = true;
} else if (arg.startsWith("--config=")) {
configure(arg.substring("--config=".length()));
} else if (arg.startsWith("--digest=")) {
// Wrap whatever parser is currently set so that it also computes digests.
// NOTE(review): unlike configure(), this branch does not re-register the
// wrapped parser on context under Parser.class; confirm that is intended.
CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(arg.substring("--digest=".length()));
digester = new CommonsDigester(MAX_MARK, algos);
parser = new DigestingParser(parser, digester);
} else if (arg.startsWith("-e")) {
// Short form: the encoding is attached directly to the flag.
encoding = arg.substring("-e".length());
} else if (arg.startsWith("--encoding=")) {
encoding = arg.substring("--encoding=".length());
} else if (arg.startsWith("-p") && !arg.equals("-p")) {
// "-p" with an attached value is a password; a bare "-p" is handled
// further down as the server/port switch.
password = arg.substring("-p".length());
} else if (arg.startsWith("--password=")) {
password = arg.substring("--password=".length());
// --- Output type selection ---
} else if (arg.equals("-j") || arg.equals("--json")) {
type = JSON;
} else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
recursiveJSON = true;
} else if (arg.equals("-y") || arg.equals("--xmp")) {
type = XMP;
} else if (arg.equals("-x") || arg.equals("--xml")) {
type = XML;
} else if (arg.equals("-h") || arg.equals("--html")) {
type = HTML;
} else if (arg.equals("-t") || arg.equals("--text")) {
type = TEXT;
} else if (arg.equals("-T") || arg.equals("--text-main")) {
type = TEXT_MAIN;
} else if (arg.equals("-m") || arg.equals("--metadata")) {
type = METADATA;
} else if (arg.equals("-l") || arg.equals("--language")) {
type = LANGUAGE;
} else if (arg.equals("-d") || arg.equals("--detect")) {
type = DETECT;
} else if (arg.startsWith("--extract-dir=")) {
extractDir = new File(arg.substring("--extract-dir=".length()));
} else if (arg.equals("-z") || arg.equals("--extract")) {
// Extraction: no regular output; embedded documents go to a
// FileEmbeddedDocumentExtractor registered on the context.
type = NO_OUTPUT;
context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
} else if (arg.equals("-r") || arg.equals("--pretty-print")) {
prettyPrint = true;
} else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) {
// Server mode: the next unrecognised argument is read as the port number.
serverMode = true;
pipeMode = false;
} else if (arg.startsWith("-c")) {
// Short form: the client URI is attached directly to the flag.
URI uri = new URI(arg.substring("-c".length()));
parser = new NetworkParser(uri);
} else if (arg.startsWith("--client=")) {
URI uri = new URI(arg.substring("--client=".length()));
parser = new NetworkParser(uri);
} else {
// --- Not an option: treat the argument as input ---
pipeMode = false;
if (serverMode) {
// In server mode the argument is the port to listen on.
new TikaServer(Integer.parseInt(arg)).start();
} else if (arg.equals("-")) {
// "-" reads from standard input; the shield keeps System.in open when
// the try-with-resources block closes the wrapping stream.
try (InputStream stream = TikaInputStream.get(new CloseShieldInputStream(System.in))) {
type.process(stream, System.out, new Metadata());
}
} else {
// Use the argument as a local file if one exists, otherwise as a URL.
URL url;
File file = new File(arg);
if (file.isFile()) {
url = file.toURI().toURL();
} else {
url = new URL(arg);
}
if (recursiveJSON) {
handleRecursiveJson(url, System.out);
} else {
Metadata metadata = new Metadata();
try (InputStream input = TikaInputStream.get(url, metadata)) {
type.process(input, System.out, metadata);
} finally {
// Flush System.out whether or not processing succeeded.
System.out.flush();
}
}
}
}
}
Aggregations