Search in sources :

Example 6 with CommonsDigester

use of org.apache.tika.parser.utils.CommonsDigester in project tika by apache.

the class DigestingParserTest method checkMulti.

/**
 * Digests the file at {@code tmp} with a freshly built {@link CommonsDigester}
 * and asserts that, for every requested algorithm, the digest stored in the
 * resulting metadata equals the one recorded in {@code truth}.
 *
 * @param truth              metadata holding the expected digest per algorithm
 * @param tmp                file whose content is digested
 * @param fileLength         only used to build the assertion messages
 * @param markLimit          mark limit handed to the {@link CommonsDigester} constructor
 * @param useTikaInputStream if {@code true} the file is opened through
 *                           {@link TikaInputStream}, otherwise through a
 *                           {@link BufferedInputStream}
 * @param algos              algorithms to compute and verify
 * @throws IOException if opening or digesting the file fails
 */
private void checkMulti(Metadata truth, Path tmp, int fileLength, int markLimit, boolean useTikaInputStream, CommonsDigester.DigestAlgorithm... algos) throws IOException {
    Metadata digested = new Metadata();
    CommonsDigester commonsDigester = new CommonsDigester(markLimit, algos);

    // Choose the kind of stream under test, then let try-with-resources own it.
    InputStream source;
    if (useTikaInputStream) {
        source = TikaInputStream.get(tmp);
    } else {
        source = new BufferedInputStream(Files.newInputStream(tmp));
    }
    try (InputStream is = source) {
        commonsDigester.digest(is, digested, new ParseContext());
    }

    for (int idx = 0; idx < algos.length; idx++) {
        String algoName = algos[idx].name();
        String key = P + algoName;
        String expected = truth.get(key);
        String actual = digested.get(key);

        assertNotNull("truth", expected);
        assertNotNull("result (fileLength=" + fileLength + ", markLimit=" + markLimit + ")", actual);

        // Put every varying input into the message so a failure can be reproduced.
        String context = "fileLength(" + fileLength + ") markLimit(" + markLimit + ") useTikaInputStream(" + useTikaInputStream + ")"
                + "algorithm(" + algoName + ") seed(" + SEED + ")";
        assertEquals(context, expected, actual);
    }
}
Also used : BufferedInputStream(java.io.BufferedInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester)

Example 7 with CommonsDigester

use of org.apache.tika.parser.utils.CommonsDigester in project tika by apache.

the class RecursiveParserWrapperTest method testDigesters.

/**
 * Runs the recursive parser over {@code test_recursive_embedded.docx} with an
 * MD5 {@link CommonsDigester} attached and checks the MD5 digest recorded in
 * the metadata entries at indices 0, 6 and 7 of the returned list.
 *
 * @throws Exception if obtaining the metadata list fails
 */
@Test
public void testDigesters() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    List<Metadata> list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
    // Metadata key under which the MD5 digest is expected to be stored.
    String md5Key = "X-TIKA:digest:MD5";
    assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
    assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
    assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) Test(org.junit.Test)

Example 8 with CommonsDigester

use of org.apache.tika.parser.utils.CommonsDigester in project tika by apache.

the class TikaCLI method process.

/**
 * Handles a single command-line argument.
 * <p>
 * The argument is matched against the known options in the order of the
 * branches below. A match either runs an action immediately (usage, version,
 * listings, config dumps, GUI) or updates state that later arguments use
 * (output type, encoding, password, parser, digester, ...). An argument that
 * matches no option is treated as input: an integer for the server when
 * server mode was requested, {@code "-"} for standard input, or otherwise a
 * file path or URL to process.
 * <p>
 * Branch order matters: several options are matched by prefix
 * ({@code -e}, {@code -p}, {@code -c}) and must not shadow the exact matches.
 *
 * @param arg the command-line argument to handle
 * @throws Exception if the triggered action or the processing of the input fails
 */
public void process(String arg) throws Exception {
    if (arg.equals("-?") || arg.equals("--help")) {
        // Informational options switch off pipe mode and run right away.
        pipeMode = false;
        usage();
    } else if (arg.equals("-V") || arg.equals("--version")) {
        pipeMode = false;
        version();
    } else if (arg.equals("-v") || arg.equals("--verbose")) {
        // Raises the log4j root logger to DEBUG; pipeMode is left untouched.
        org.apache.log4j.Logger.getRootLogger().setLevel(Level.DEBUG);
    } else if (arg.equals("-g") || arg.equals("--gui")) {
        pipeMode = false;
        // Hand the config file path to the GUI if one has been set.
        if (configFilePath != null) {
            TikaGUI.main(new String[] { configFilePath });
        } else {
            TikaGUI.main(new String[0]);
        }
    } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
        pipeMode = false;
        displayParsers(false, false);
    } else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) {
        pipeMode = false;
        displayDetectors();
    } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) {
        pipeMode = false;
        displayParsers(true, false);
    } else if (arg.equals("--list-parser-detail-apt") || arg.equals("--list-parser-details-apt")) {
        pipeMode = false;
        displayParsers(true, true);
    } else if (arg.equals("--list-met-models")) {
        pipeMode = false;
        displayMetModels();
    } else if (arg.equals("--list-supported-types")) {
        pipeMode = false;
        displaySupportedTypes();
    } else if (arg.startsWith("--compare-file-magic=")) {
        pipeMode = false;
        // Everything after the first '=' is passed on as the option value.
        compareFileMagic(arg.substring(arg.indexOf('=') + 1));
    } else if (arg.equals("--dump-minimal-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
    } else if (arg.equals("--dump-current-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.CURRENT);
    } else if (arg.equals("--dump-static-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.STATIC);
    } else if (arg.equals("--dump-static-full-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);
    } else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) {
    // ignore, as container-aware detectors are now always used
    } else if (arg.equals("-f") || arg.equals("--fork")) {
        fork = true;
    } else if (arg.startsWith("--config=")) {
        configure(arg.substring("--config=".length()));
    } else if (arg.startsWith("--digest=")) {
        // Wrap the current parser in a DigestingParser that uses the
        // algorithms parsed from the option value.
        CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(arg.substring("--digest=".length()));
        digester = new CommonsDigester(MAX_MARK, algos);
        parser = new DigestingParser(parser, digester);
    } else if (arg.startsWith("-e")) {
        // Prefix match: the encoding is whatever follows "-e" (no separator).
        encoding = arg.substring("-e".length());
    } else if (arg.startsWith("--encoding=")) {
        encoding = arg.substring("--encoding=".length());
    } else if (arg.startsWith("-p") && !arg.equals("-p")) {
        // "-p<value>" sets the password; a bare "-p" is excluded here and
        // handled by the server/port branch further down.
        password = arg.substring("-p".length());
    } else if (arg.startsWith("--password=")) {
        password = arg.substring("--password=".length());
    } else if (arg.equals("-j") || arg.equals("--json")) {
        // The following options only select the output type.
        type = JSON;
    } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
        // Not an output type: switches the input handling below to handleRecursiveJson.
        recursiveJSON = true;
    } else if (arg.equals("-y") || arg.equals("--xmp")) {
        type = XMP;
    } else if (arg.equals("-x") || arg.equals("--xml")) {
        type = XML;
    } else if (arg.equals("-h") || arg.equals("--html")) {
        type = HTML;
    } else if (arg.equals("-t") || arg.equals("--text")) {
        type = TEXT;
    } else if (arg.equals("-T") || arg.equals("--text-main")) {
        type = TEXT_MAIN;
    } else if (arg.equals("-m") || arg.equals("--metadata")) {
        type = METADATA;
    } else if (arg.equals("-l") || arg.equals("--language")) {
        type = LANGUAGE;
    } else if (arg.equals("-d") || arg.equals("--detect")) {
        type = DETECT;
    } else if (arg.startsWith("--extract-dir=")) {
        extractDir = new File(arg.substring("--extract-dir=".length()));
    } else if (arg.equals("-z") || arg.equals("--extract")) {
        // Extraction mode: select NO_OUTPUT and register a file-based
        // embedded document extractor in the parse context.
        type = NO_OUTPUT;
        context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
    } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
        prettyPrint = true;
    } else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) {
        // Only flags server mode; the server itself is started by the
        // fallback branch when a later argument arrives.
        serverMode = true;
        pipeMode = false;
    } else if (arg.startsWith("-c")) {
        // Prefix match: the URI is whatever follows "-c" (no separator).
        URI uri = new URI(arg.substring("-c".length()));
        parser = new NetworkParser(uri);
    } else if (arg.startsWith("--client=")) {
        URI uri = new URI(arg.substring("--client=".length()));
        parser = new NetworkParser(uri);
    } else {
        // Not a recognised option: treat the argument as input.
        pipeMode = false;
        if (serverMode) {
            // NOTE(review): the integer is presumably the port number — confirm against TikaServer.
            new TikaServer(Integer.parseInt(arg)).start();
        } else if (arg.equals("-")) {
            // Read the document from standard input.
            // NOTE(review): the CloseShieldInputStream presumably keeps System.in open
            // when the try block closes the stream — confirm.
            try (InputStream stream = TikaInputStream.get(new CloseShieldInputStream(System.in))) {
                type.process(stream, System.out, new Metadata());
            }
        } else {
            // An existing regular file is converted to its URL; anything
            // else is parsed as a URL directly.
            URL url;
            File file = new File(arg);
            if (file.isFile()) {
                url = file.toURI().toURL();
            } else {
                url = new URL(arg);
            }
            if (recursiveJSON) {
                handleRecursiveJson(url, System.out);
            } else {
                Metadata metadata = new Metadata();
                try (InputStream input = TikaInputStream.get(url, metadata)) {
                    type.process(input, System.out, metadata);
                } finally {
                    // Flush standard output whether or not processing succeeded.
                    System.out.flush();
                }
            }
        }
    }
}
Also used : CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) DocumentInputStream(org.apache.poi.poifs.filesystem.DocumentInputStream) InputStream(java.io.InputStream) JsonMetadata(org.apache.tika.metadata.serialization.JsonMetadata) Metadata(org.apache.tika.metadata.Metadata) XMPMetadata(org.apache.tika.xmp.XMPMetadata) DigestingParser(org.apache.tika.parser.DigestingParser) URI(java.net.URI) NetworkParser(org.apache.tika.parser.NetworkParser) URL(java.net.URL) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) File(java.io.File)

Example 9 with CommonsDigester

use of org.apache.tika.parser.utils.CommonsDigester in project tika by apache.

the class TikaGUI method main.

/**
 * Main method. Sets the Swing look and feel to the operating system
 * settings, and starts the Tika GUI with a {@link DigestingParser} that
 * wraps an {@link AutoDetectParser} and computes MD5 and SHA256 digests.
 *
 * @param args optional; when at least one argument is given, {@code args[0]}
 *             is used as the path of the Tika configuration file instead of
 *             the default configuration. Further arguments are not read.
 * @throws Exception if an error occurs
 */
public static void main(String[] args) throws Exception {
    TikaConfig config = TikaConfig.getDefaultConfig();
    if (args.length > 0) {
        // A config file path was supplied: it replaces the default config.
        File configFile = new File(args[0]);
        config = new TikaConfig(configFile);
    }
    UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
    // The anonymous class below can only capture a final local.
    final TikaConfig finalConfig = config;
    // Build and show the window via SwingUtilities.invokeLater.
    SwingUtilities.invokeLater(new Runnable() {

        @Override
        public void run() {
            new TikaGUI(new DigestingParser(new AutoDetectParser(finalConfig), new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256))).setVisible(true);
        }
    });
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) File(java.io.File)

Aggregations

CommonsDigester (org.apache.tika.parser.utils.CommonsDigester)9 Metadata (org.apache.tika.metadata.Metadata)6 Test (org.junit.Test)4 TikaTest (org.apache.tika.TikaTest)3 DigestingParser (org.apache.tika.parser.DigestingParser)3 File (java.io.File)2 InputStream (java.io.InputStream)2 BindingFactoryManager (org.apache.cxf.binding.BindingFactoryManager)2 JAXRSBindingFactory (org.apache.cxf.jaxrs.JAXRSBindingFactory)2 JAXRSServerFactoryBean (org.apache.cxf.jaxrs.JAXRSServerFactoryBean)2 TikaConfig (org.apache.tika.config.TikaConfig)2 TikaInputStream (org.apache.tika.io.TikaInputStream)2 BufferedInputStream (java.io.BufferedInputStream)1 FileInputStream (java.io.FileInputStream)1 URI (java.net.URI)1 URL (java.net.URL)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 CommandLine (org.apache.commons.cli.CommandLine)1 CommandLineParser (org.apache.commons.cli.CommandLineParser)1