Usage of org.apache.tika.parser.utils.CommonsDigester in the Apache Tika project.
Class DigestingParserTest, method checkMulti.
/**
 * Digests the file at {@code tmp} with a newly created {@link CommonsDigester} and asserts,
 * for each requested algorithm, that the digest written to the result metadata equals the
 * value stored under the same key in {@code truth}.
 *
 * @param truth metadata holding the expected digests, looked up by {@code P + algorithm name}
 * @param tmp file whose content is digested
 * @param fileLength length of the file; used only in the assertion messages
 * @param markLimit first argument passed to the {@link CommonsDigester} constructor
 * @param useTikaInputStream if {@code true} the file is opened with {@code TikaInputStream.get};
 *        otherwise with a {@link BufferedInputStream} around {@code Files.newInputStream}
 * @param algos digest algorithms to compute and to verify
 * @throws IOException if opening or digesting the file fails
 */
private void checkMulti(Metadata truth, Path tmp, int fileLength, int markLimit, boolean useTikaInputStream, CommonsDigester.DigestAlgorithm... algos) throws IOException {
    final Metadata computed = new Metadata();
    final CommonsDigester commonsDigester = new CommonsDigester(markLimit, algos);

    try (InputStream input = useTikaInputStream
            ? TikaInputStream.get(tmp)
            : new BufferedInputStream(Files.newInputStream(tmp))) {
        commonsDigester.digest(input, computed, new ParseContext());
    }

    for (int idx = 0; idx < algos.length; idx++) {
        final String algoName = algos[idx].name();
        final String key = P + algoName;

        final String expected = truth.get(key);
        final String actual = computed.get(key);

        // Both sides must have a digest before the values themselves are compared.
        assertNotNull("truth", expected);
        final String missingResultMessage = "result (fileLength=" + fileLength + ", markLimit=" + markLimit + ")";
        assertNotNull(missingResultMessage, actual);

        // The message carries every test parameter so a failing combination can be reproduced.
        final String mismatchMessage = "fileLength(" + fileLength + ") markLimit(" + markLimit + ") useTikaInputStream(" + useTikaInputStream + ")" + "algorithm(" + algoName + ") seed(" + SEED + ")";
        assertEquals(mismatchMessage, expected, actual);
    }
}
Usage of org.apache.tika.parser.utils.CommonsDigester in the Apache Tika project.
Class RecursiveParserWrapperTest, method testDigesters.
/**
 * Checks the MD5 digests reported for {@code test_recursive_embedded.docx}.
 * <p>
 * The metadata list is obtained from {@code getMetadata} with a {@link CommonsDigester}
 * configured for MD5 only, and the value stored under {@code X-TIKA:digest:MD5} is compared
 * against a fixed expected hash for the entries at indices 0, 6 and 7.
 *
 * @throws Exception if obtaining the metadata list fails
 */
@Test
public void testDigesters() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    List<Metadata> list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
    String md5Key = "X-TIKA:digest:MD5";
    // NOTE(review): index 0 is presumably the container document and 6/7 embedded documents - confirm.
    assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
    assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
    assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
}
Usage of org.apache.tika.parser.utils.CommonsDigester in the Apache Tika project.
Class TikaCLI, method process.
/**
 * Handles a single command-line argument.
 * <p>
 * A recognised flag either runs a one-shot command immediately (usage, version, GUI,
 * listings, file-magic comparison, configuration dumps) or updates state that is used when a
 * later argument is processed (output type, encoding, password, parser, digester, extract
 * directory, fork/pretty-print/server switches). An argument that matches no flag is treated
 * as input: a value handed to {@code TikaServer} when server mode is on, {@code "-"} for
 * standard input, otherwise a file path or URL.
 * <p>
 * Branches that run a one-shot command, enable server mode, or consume an input argument set
 * {@code pipeMode} to {@code false}; pure option branches leave it untouched.
 * NOTE(review): presumably {@code pipeMode} tells the caller whether to fall back to reading
 * standard input when no input argument was given - confirm against the caller.
 * <p>
 * The order of the branches matters because several of them are prefix matches
 * ({@code -e}, {@code -p}, {@code -c}) that would otherwise shadow later exact matches.
 *
 * @param arg one command-line argument
 * @throws Exception if the selected command or the processing of the input fails
 */
public void process(String arg) throws Exception {
if (arg.equals("-?") || arg.equals("--help")) {
pipeMode = false;
usage();
} else if (arg.equals("-V") || arg.equals("--version")) {
pipeMode = false;
version();
} else if (arg.equals("-v") || arg.equals("--verbose")) {
// Raises the log4j root logger to DEBUG; affects everything logged from here on.
org.apache.log4j.Logger.getRootLogger().setLevel(Level.DEBUG);
} else if (arg.equals("-g") || arg.equals("--gui")) {
pipeMode = false;
// Forward the config file path to the GUI only if one has been set by this point.
if (configFilePath != null) {
TikaGUI.main(new String[] { configFilePath });
} else {
TikaGUI.main(new String[0]);
}
} else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
pipeMode = false;
displayParsers(false, false);
} else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) {
pipeMode = false;
displayDetectors();
} else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) {
pipeMode = false;
displayParsers(true, false);
} else if (arg.equals("--list-parser-detail-apt") || arg.equals("--list-parser-details-apt")) {
pipeMode = false;
displayParsers(true, true);
} else if (arg.equals("--list-met-models")) {
pipeMode = false;
displayMetModels();
} else if (arg.equals("--list-supported-types")) {
pipeMode = false;
displaySupportedTypes();
} else if (arg.startsWith("--compare-file-magic=")) {
pipeMode = false;
// Everything after the first '=' is passed on as the argument value.
compareFileMagic(arg.substring(arg.indexOf('=') + 1));
} else if (arg.equals("--dump-minimal-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
} else if (arg.equals("--dump-current-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.CURRENT);
} else if (arg.equals("--dump-static-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.STATIC);
} else if (arg.equals("--dump-static-full-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);
} else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) {
// ignore, as container-aware detectors are now always used
} else if (arg.equals("-f") || arg.equals("--fork")) {
fork = true;
} else if (arg.startsWith("--config=")) {
configure(arg.substring("--config=".length()));
} else if (arg.startsWith("--digest=")) {
// Wraps whatever parser is current at this point in a DigestingParser. A later branch
// that reassigns parser (-c / --client=) replaces this wrapper.
CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(arg.substring("--digest=".length()));
digester = new CommonsDigester(MAX_MARK, algos);
parser = new DigestingParser(parser, digester);
} else if (arg.startsWith("-e")) {
// "-e<encoding>": the text after the two-character flag is the encoding.
encoding = arg.substring("-e".length());
} else if (arg.startsWith("--encoding=")) {
encoding = arg.substring("--encoding=".length());
} else if (arg.startsWith("-p") && !arg.equals("-p")) {
// "-p<password>": a bare "-p" is excluded so that it reaches the server-mode branch below.
password = arg.substring("-p".length());
} else if (arg.startsWith("--password=")) {
password = arg.substring("--password=".length());
} else if (arg.equals("-j") || arg.equals("--json")) {
type = JSON;
} else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
recursiveJSON = true;
} else if (arg.equals("-y") || arg.equals("--xmp")) {
type = XMP;
} else if (arg.equals("-x") || arg.equals("--xml")) {
type = XML;
} else if (arg.equals("-h") || arg.equals("--html")) {
type = HTML;
} else if (arg.equals("-t") || arg.equals("--text")) {
type = TEXT;
} else if (arg.equals("-T") || arg.equals("--text-main")) {
type = TEXT_MAIN;
} else if (arg.equals("-m") || arg.equals("--metadata")) {
type = METADATA;
} else if (arg.equals("-l") || arg.equals("--language")) {
type = LANGUAGE;
} else if (arg.equals("-d") || arg.equals("--detect")) {
type = DETECT;
} else if (arg.startsWith("--extract-dir=")) {
extractDir = new File(arg.substring("--extract-dir=".length()));
} else if (arg.equals("-z") || arg.equals("--extract")) {
// Extraction mode: select the NO_OUTPUT type and register a file-based extractor for
// embedded documents in the parse context.
type = NO_OUTPUT;
context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
} else if (arg.equals("-r") || arg.equals("--pretty-print")) {
prettyPrint = true;
} else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) {
// Only switches server mode on; the server itself is started by the fallback branch
// when a later non-flag argument is seen.
serverMode = true;
pipeMode = false;
} else if (arg.startsWith("-c")) {
// "-c<uri>": replaces the current parser with a NetworkParser for that URI.
URI uri = new URI(arg.substring("-c".length()));
parser = new NetworkParser(uri);
} else if (arg.startsWith("--client=")) {
URI uri = new URI(arg.substring("--client=".length()));
parser = new NetworkParser(uri);
} else {
// Not a recognised flag: treat the argument as input.
pipeMode = false;
if (serverMode) {
// The argument must be an integer (Integer.parseInt throws NumberFormatException
// otherwise); presumably it is the port to listen on - confirm against TikaServer.
new TikaServer(Integer.parseInt(arg)).start();
} else if (arg.equals("-")) {
// "-" means standard input. NOTE(review): CloseShieldInputStream presumably keeps
// System.in open when the try block closes the wrapping stream - confirm.
try (InputStream stream = TikaInputStream.get(new CloseShieldInputStream(System.in))) {
type.process(stream, System.out, new Metadata());
}
} else {
// An existing regular file wins; anything else has to parse as a URL
// (new URL throws MalformedURLException if it does not).
URL url;
File file = new File(arg);
if (file.isFile()) {
url = file.toURI().toURL();
} else {
url = new URL(arg);
}
if (recursiveJSON) {
handleRecursiveJson(url, System.out);
} else {
Metadata metadata = new Metadata();
try (InputStream input = TikaInputStream.get(url, metadata)) {
type.process(input, System.out, metadata);
} finally {
// Flush even when processing fails so that partial output is not lost.
System.out.flush();
}
}
}
}
}
Usage of org.apache.tika.parser.utils.CommonsDigester in the Apache Tika project.
Class TikaGUI, method main.
/**
 * Main method. Sets the Swing look and feel to the operating system
 * settings, and starts the Tika GUI with a {@link DigestingParser} that
 * wraps an {@link AutoDetectParser} and computes MD5 and SHA256 digests.
 *
 * @param args optional; if at least one argument is present, the first one
 *             is used as the path of the Tika configuration file to load,
 *             otherwise the default configuration is used. Any further
 *             arguments are ignored.
 * @throws Exception if an error occurs
 */
public static void main(String[] args) throws Exception {
    TikaConfig config = TikaConfig.getDefaultConfig();
    if (args.length > 0) {
        // An explicit configuration file replaces the default configuration.
        config = new TikaConfig(new File(args[0]));
    }
    UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
    // The anonymous Runnable can only capture a final local.
    final TikaConfig finalConfig = config;
    // Build and show the window on the Swing event dispatch thread.
    SwingUtilities.invokeLater(new Runnable() {
        @Override
        public void run() {
            CommonsDigester digester = new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256);
            DigestingParser digestingParser = new DigestingParser(new AutoDetectParser(finalConfig), digester);
            new TikaGUI(digestingParser).setVisible(true);
        }
    });
}
Aggregations