Search in sources :

Example 1 with DataStoreTextWriter

use of org.apache.jackrabbit.oak.plugins.index.datastore.DataStoreTextWriter in project jackrabbit-oak by apache.

the class TextExtractorMain method main.

public static void main(String[] args) throws Exception {
    Closer closer = Closer.create();
    String h = "tika [extract|report|generate]\n" + "\n" + "report   : Generates a summary report related to binary data\n" + "extract  : Performs the text extraction\n" + "generate : Generates the csv data file based on configured NodeStore/BlobStore";
    try {
        OptionParser parser = new OptionParser();
        OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"), "show help").forHelp();
        OptionSpec<String> nodeStoreSpec = parser.accepts("nodestore", "NodeStore detail /path/to/oak/repository | mongodb://host:port/database").withRequiredArg().ofType(String.class);
        OptionSpec<String> pathSpec = parser.accepts("path", "Path in repository under which the binaries would be searched").withRequiredArg().ofType(String.class);
        OptionSpec<File> dataFileSpec = parser.accepts("data-file", "Data file in csv format containing the binary metadata").withRequiredArg().ofType(File.class);
        OptionSpec<File> tikaConfigSpec = parser.accepts("tika-config", "Tika config file path").withRequiredArg().ofType(File.class);
        OptionSpec<File> fdsDirSpec = parser.accepts("fds-path", "Path of directory used by FileDataStore").withRequiredArg().ofType(File.class);
        OptionSpec<File> s3ConfigSpec = parser.accepts("s3-config-path", "Path of properties file containing config for S3DataStore").withRequiredArg().ofType(File.class);
        OptionSpec<File> storeDirSpec = parser.accepts("store-path", "Path of directory used to store extracted text content").withRequiredArg().ofType(File.class);
        OptionSpec<Integer> poolSize = parser.accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults " + "to number of cores on the system").withRequiredArg().ofType(Integer.class);
        OptionSpec<String> nonOption = parser.nonOptions(h);
        OptionSet options = parser.parse(args);
        List<String> nonOptions = nonOption.values(options);
        if (options.has(help)) {
            parser.printHelpOn(System.out);
            System.exit(0);
        }
        if (nonOptions.isEmpty()) {
            parser.printHelpOn(System.err);
            System.exit(1);
        }
        boolean report = nonOptions.contains("report");
        boolean extract = nonOptions.contains("extract");
        boolean generate = nonOptions.contains("generate");
        File dataFile = null;
        File storeDir = null;
        File tikaConfigFile = null;
        BlobStore blobStore = null;
        BinaryResourceProvider binaryResourceProvider = null;
        BinaryStats stats = null;
        String path = "/";
        if (options.has(tikaConfigSpec)) {
            tikaConfigFile = tikaConfigSpec.value(options);
            checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist", tikaConfigFile.getAbsolutePath());
        }
        if (options.has(storeDirSpec)) {
            storeDir = storeDirSpec.value(options);
            if (storeDir.exists()) {
                checkArgument(storeDir.isDirectory(), "Path [%s] specified for storing extracted " + "text content '%s' is not a directory", storeDir.getAbsolutePath(), storeDirSpec.options());
            }
        }
        if (options.has(fdsDirSpec)) {
            File fdsDir = fdsDirSpec.value(options);
            checkArgument(fdsDir.exists(), "FileDataStore %s does not exist", fdsDir.getAbsolutePath());
            FileDataStore fds = new FileDataStore();
            fds.setPath(fdsDir.getAbsolutePath());
            fds.init(null);
            blobStore = new DataStoreBlobStore(fds);
        }
        if (options.has(s3ConfigSpec)) {
            File s3Config = s3ConfigSpec.value(options);
            checkArgument(s3Config.exists() && s3Config.canRead(), "S3DataStore config cannot be read from [%s]", s3Config.getAbsolutePath());
            Properties props = loadProperties(s3Config);
            log.info("Loaded properties for S3DataStore from {}", s3Config.getAbsolutePath());
            String pathProp = "path";
            String repoPath = props.getProperty(pathProp);
            checkNotNull(repoPath, "Missing required property [%s] from S3DataStore config loaded from [%s]", pathProp, s3Config);
            //Check if 'secret' key is defined. It should be non null for references
            //to be generated. As the ref are transient we can just use any random value
            //if not specified
            String secretConfig = "secret";
            if (props.getProperty(secretConfig) == null) {
                props.setProperty(secretConfig, UUID.randomUUID().toString());
            }
            log.info("Using {} for S3DataStore ", repoPath);
            DataStore ds = createS3DataStore(props);
            PropertiesUtil.populate(ds, toMap(props), false);
            ds.init(pathProp);
            blobStore = new DataStoreBlobStore(ds);
            closer.register(asCloseable(ds));
        }
        if (options.has(dataFileSpec)) {
            dataFile = dataFileSpec.value(options);
        }
        checkNotNull(dataFile, "Data file not configured with %s", dataFileSpec);
        if (report || extract) {
            checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
            binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
            if (binaryResourceProvider instanceof Closeable) {
                closer.register((Closeable) binaryResourceProvider);
            }
            stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
            String summary = stats.getSummary();
            log.info(summary);
        }
        if (generate) {
            String src = nodeStoreSpec.value(options);
            checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " + "must be specified via %s", fdsDirSpec.options());
            checkNotNull(dataFile, "Data file path not provided");
            NodeStore nodeStore = bootStrapNodeStore(src, blobStore, closer);
            BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);
            CSVFileGenerator generator = new CSVFileGenerator(dataFile);
            generator.generate(brp.getBinaries(path));
        }
        if (extract) {
            checkNotNull(storeDir, "Directory to store extracted text content " + "must be specified via %s", storeDirSpec.options());
            checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " + "must be specified via %s", fdsDirSpec.options());
            DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false);
            TextExtractor extractor = new TextExtractor(writer);
            if (options.has(poolSize)) {
                extractor.setThreadPoolSize(poolSize.value(options));
            }
            if (tikaConfigFile != null) {
                extractor.setTikaConfig(tikaConfigFile);
            }
            if (options.has(pathSpec)) {
                path = pathSpec.value(options);
            }
            closer.register(writer);
            closer.register(extractor);
            extractor.setStats(stats);
            log.info("Using path {}", path);
            extractor.extract(binaryResourceProvider.getBinaries(path));
            extractor.close();
            writer.close();
        }
    } catch (Throwable e) {
        throw closer.rethrow(e);
    } finally {
        closer.close();
    }
}
Also used : Closeable(java.io.Closeable) Properties(java.util.Properties) OptionParser(joptsimple.OptionParser) DocumentNodeStore(org.apache.jackrabbit.oak.plugins.document.DocumentNodeStore) NodeStore(org.apache.jackrabbit.oak.spi.state.NodeStore) S3DataStore(org.apache.jackrabbit.aws.ext.ds.S3DataStore) DataStore(org.apache.jackrabbit.core.data.DataStore) FileDataStore(org.apache.jackrabbit.core.data.FileDataStore) FileDataStore(org.apache.jackrabbit.core.data.FileDataStore) DataStoreBlobStore(org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore) BlobStore(org.apache.jackrabbit.oak.spi.blob.BlobStore) Closer(com.google.common.io.Closer) DataStoreTextWriter(org.apache.jackrabbit.oak.plugins.index.datastore.DataStoreTextWriter) OptionSet(joptsimple.OptionSet) File(java.io.File) DataStoreBlobStore(org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore)

Aggregations

Closer (com.google.common.io.Closer)1 Closeable (java.io.Closeable)1 File (java.io.File)1 Properties (java.util.Properties)1 OptionParser (joptsimple.OptionParser)1 OptionSet (joptsimple.OptionSet)1 S3DataStore (org.apache.jackrabbit.aws.ext.ds.S3DataStore)1 DataStore (org.apache.jackrabbit.core.data.DataStore)1 FileDataStore (org.apache.jackrabbit.core.data.FileDataStore)1 DataStoreBlobStore (org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore)1 DocumentNodeStore (org.apache.jackrabbit.oak.plugins.document.DocumentNodeStore)1 DataStoreTextWriter (org.apache.jackrabbit.oak.plugins.index.datastore.DataStoreTextWriter)1 BlobStore (org.apache.jackrabbit.oak.spi.blob.BlobStore)1 NodeStore (org.apache.jackrabbit.oak.spi.state.NodeStore)1