Search in sources :

Example 1 with BlobStoreFixture

Use of org.apache.jackrabbit.oak.run.cli.BlobStoreFixture in the Apache jackrabbit-oak project.

The main method of the class TextExtractorMain.

/**
 * Command line entry point for the text extraction command.
 *
 * <p>Depending on the parsed options, up to three steps run in this order:
 * <ol>
 *   <li><b>generate</b> - lists the binaries under the configured path via a
 *       {@code NodeStoreBinaryResourceProvider} and hands them to a
 *       {@code CSVFileGenerator} targeting the data file;</li>
 *   <li><b>report</b> - reads the data file through a
 *       {@code CSVFileBinaryResourceProvider} and logs the summary produced by
 *       {@code BinaryStats};</li>
 *   <li><b>extract</b> - additionally runs a {@code TextExtractor} over the binaries
 *       listed in the data file, writing through a {@code DataStoreTextWriter} rooted
 *       at the store directory.</li>
 * </ol>
 *
 * <p>A NodeStore is only opened for the generate step; otherwise only a BlobStore is
 * created. Every closeable resource is registered with a {@code Closer} as soon as it
 * is created so that it is released even when a later step fails.
 *
 * @param args raw command line arguments
 * @throws Exception if option parsing, a precondition check, or any of the steps fails
 */
public static void main(String[] args) throws Exception {
    OptionParser parser = new OptionParser();
    Options opts = new Options();
    opts.setCommandName(TikaCommandOptions.NAME);
    opts.setSummary("Provides text extraction related operations");
    opts.setConnectionString(CommonOptions.DEFAULT_CONNECTION_STRING);
    opts.registerOptionsFactory(TikaCommandOptions.FACTORY);
    // NodeStore is only required for generate command. So make it optional
    opts.parseAndConfigure(parser, args, false);
    TikaCommandOptions tikaOpts = opts.getOptionBean(TikaCommandOptions.class);
    // If generate then check that NodeStore is specified
    if (tikaOpts.generate()) {
        opts.checkNonOptions();
    }
    try (Closer closer = Closer.create()) {
        boolean report = tikaOpts.report();
        boolean extract = tikaOpts.extract();
        boolean generate = tikaOpts.generate();
        BlobStore blobStore;
        NodeStore nodeStore = null;
        File dataFile = tikaOpts.getDataFile();
        File storeDir = tikaOpts.getStoreDir();
        File tikaConfigFile = tikaOpts.getTikaConfig();
        BinaryResourceProvider binaryResourceProvider = null;
        BinaryStats stats = null;
        String path = tikaOpts.getPath();

        // Validate the file based options before any store is opened.
        if (tikaConfigFile != null) {
            checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist", tikaConfigFile.getAbsolutePath());
        }
        // A missing store directory is tolerated here; only an existing non-directory is rejected.
        if (storeDir != null && storeDir.exists()) {
            checkArgument(storeDir.isDirectory(), "Path [%s] specified for storing extracted " + "text content is not a directory", storeDir.getAbsolutePath());
        }
        // The data file is needed by all three steps, so this single check covers them all.
        checkNotNull(dataFile, "Data file not configured with %s", tikaOpts.getDataFileSpecOpt());

        if (!generate) {
            // For report and extract case we do not need NodeStore access so create BlobStore directly
            BlobStoreFixture blobStoreFixture = BlobStoreFixtureProvider.create(opts);
            closer.register(blobStoreFixture);
            blobStore = checkNotNull(blobStoreFixture).getBlobStore();
        } else {
            NodeStoreFixture nodeStoreFixture = NodeStoreFixtureProvider.create(opts);
            closer.register(nodeStoreFixture);
            blobStore = nodeStoreFixture.getBlobStore();
            nodeStore = nodeStoreFixture.getStore();
        }
        // From here on blobStore is guaranteed to be non-null for every step below.
        checkNotNull(blobStore, "This command requires an external BlobStore configured");

        if (generate) {
            log.info("Generated csv data to be stored in {}", dataFile.getAbsolutePath());
            BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);
            CSVFileGenerator generator = new CSVFileGenerator(dataFile);
            generator.generate(brp.getBinaries(path));
        }

        if (report || extract) {
            checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
            CSVFileBinaryResourceProvider csvProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
            closer.register(csvProvider);
            binaryResourceProvider = csvProvider;
            stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
            String summary = stats.getSummary();
            log.info(summary);
        }

        if (extract) {
            checkNotNull(storeDir, "Directory to store extracted text content " + "must be specified via %s", tikaOpts.getStoreDirSpecOpt());
            // Register each resource the moment it exists so that a failure in the
            // configuration calls below cannot leave it open.
            DataStoreTextWriter writer = closer.register(new DataStoreTextWriter(storeDir, false));
            TextExtractor extractor = closer.register(new TextExtractor(writer));
            if (tikaOpts.isPoolSizeDefined()) {
                extractor.setThreadPoolSize(tikaOpts.getPoolSize());
            }
            if (tikaConfigFile != null) {
                extractor.setTikaConfig(tikaConfigFile);
            }
            extractor.setStats(stats);
            log.info("Using path {}", path);
            // binaryResourceProvider is non-null here: extract implies the block above ran.
            extractor.extract(binaryResourceProvider.getBinaries(path));
            // Close the extractor before the writer it writes through. Both are also
            // registered with the closer, which covers the failure paths.
            extractor.close();
            writer.close();
        }
    }
}
Also used : Closer(com.google.common.io.Closer) CommonOptions(org.apache.jackrabbit.oak.run.cli.CommonOptions) Options(org.apache.jackrabbit.oak.run.cli.Options) DataStoreTextWriter(org.apache.jackrabbit.oak.plugins.index.datastore.DataStoreTextWriter) OptionParser(joptsimple.OptionParser) NodeStore(org.apache.jackrabbit.oak.spi.state.NodeStore) BlobStoreFixture(org.apache.jackrabbit.oak.run.cli.BlobStoreFixture) NodeStoreFixture(org.apache.jackrabbit.oak.run.cli.NodeStoreFixture) File(java.io.File) BlobStore(org.apache.jackrabbit.oak.spi.blob.BlobStore)

Aggregations

Closer (com.google.common.io.Closer): 1, File (java.io.File): 1, OptionParser (joptsimple.OptionParser): 1, DataStoreTextWriter (org.apache.jackrabbit.oak.plugins.index.datastore.DataStoreTextWriter): 1, BlobStoreFixture (org.apache.jackrabbit.oak.run.cli.BlobStoreFixture): 1, CommonOptions (org.apache.jackrabbit.oak.run.cli.CommonOptions): 1, NodeStoreFixture (org.apache.jackrabbit.oak.run.cli.NodeStoreFixture): 1, Options (org.apache.jackrabbit.oak.run.cli.Options): 1, BlobStore (org.apache.jackrabbit.oak.spi.blob.BlobStore): 1, NodeStore (org.apache.jackrabbit.oak.spi.state.NodeStore): 1