Use of org.apache.jackrabbit.oak.run.cli.BlobStoreFixture in project jackrabbit-oak by Apache.
Class TextExtractorMain, method main.
/**
 * Command line entry point for the text extraction related operations.
 *
 * <p>Depending on the parsed {@link TikaCommandOptions} up to three steps run, in this order:
 * <ol>
 *   <li><b>generate</b> - reads the binaries below the configured path from the NodeStore and
 *       writes them as CSV to the data file;</li>
 *   <li><b>report</b> - builds {@link BinaryStats} from the CSV data file and logs the summary;</li>
 *   <li><b>extract</b> - additionally runs a {@link TextExtractor} over the binaries listed in
 *       the CSV data file, writing through a {@link DataStoreTextWriter} on the store directory.</li>
 * </ol>
 *
 * <p>Every closeable created here is registered with a {@code Closer} immediately after
 * creation, so it is released when the method exits, whether normally or with an exception.
 *
 * @param args raw command line arguments
 * @throws Exception if option parsing fails, a {@code checkNotNull}/{@code checkArgument}
 *         precondition is violated, or any of the steps fails
 */
public static void main(String[] args) throws Exception {
    OptionParser parser = new OptionParser();
    Options opts = new Options();
    opts.setCommandName(TikaCommandOptions.NAME);
    opts.setSummary("Provides text extraction related operations");
    opts.setConnectionString(CommonOptions.DEFAULT_CONNECTION_STRING);
    opts.registerOptionsFactory(TikaCommandOptions.FACTORY);

    // NodeStore is only required for the generate command, so make it optional while parsing
    opts.parseAndConfigure(parser, args, false);

    TikaCommandOptions tikaOpts = opts.getOptionBean(TikaCommandOptions.class);
    boolean generate = tikaOpts.generate();

    // If generate is requested then check that a NodeStore is specified
    if (generate) {
        opts.checkNonOptions();
    }

    try (Closer closer = Closer.create()) {
        boolean report = tikaOpts.report();
        boolean extract = tikaOpts.extract();

        File dataFile = tikaOpts.getDataFile();
        File storeDir = tikaOpts.getStoreDir();
        File tikaConfigFile = tikaOpts.getTikaConfig();
        String path = tikaOpts.getPath();

        if (tikaConfigFile != null) {
            checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist",
                    tikaConfigFile.getAbsolutePath());
        }

        // A store directory that does not exist yet is accepted; an existing path must be a directory
        if (storeDir != null && storeDir.exists()) {
            checkArgument(storeDir.isDirectory(),
                    "Path [%s] specified for storing extracted text content is not a directory",
                    storeDir.getAbsolutePath());
        }

        checkNotNull(dataFile, "Data file not configured with %s", tikaOpts.getDataFileSpecOpt());

        BlobStore blobStore;
        NodeStore nodeStore = null;
        if (generate) {
            NodeStoreFixture nodeStoreFixture = NodeStoreFixtureProvider.create(opts);
            closer.register(nodeStoreFixture);
            blobStore = nodeStoreFixture.getBlobStore();
            nodeStore = nodeStoreFixture.getStore();
        } else {
            // For report and extract we do not need NodeStore access, so create the BlobStore directly
            BlobStoreFixture blobStoreFixture = BlobStoreFixtureProvider.create(opts);
            closer.register(blobStoreFixture);
            // A missing fixture is reported by the descriptive check below instead of a bare NPE
            blobStore = blobStoreFixture != null ? blobStoreFixture.getBlobStore() : null;
        }
        checkNotNull(blobStore, "This command requires an external BlobStore configured");

        if (generate) {
            log.info("Generated csv data to be stored in {}", dataFile.getAbsolutePath());
            BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);
            CSVFileGenerator generator = new CSVFileGenerator(dataFile);
            generator.generate(brp.getBinaries(path));
        }

        if (report || extract) {
            checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
            CSVFileBinaryResourceProvider csvProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
            closer.register(csvProvider);
            BinaryResourceProvider binaryResourceProvider = csvProvider;

            BinaryStats stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
            String summary = stats.getSummary();
            log.info(summary);

            if (extract) {
                checkNotNull(storeDir, "Directory to store extracted text content must be specified via %s",
                        tikaOpts.getStoreDirSpecOpt());

                // Register each resource right after construction so that a failure while
                // configuring the extractor cannot leak the writer.
                DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false);
                closer.register(writer);
                TextExtractor extractor = new TextExtractor(writer);
                closer.register(extractor);

                if (tikaOpts.isPoolSizeDefined()) {
                    extractor.setThreadPoolSize(tikaOpts.getPoolSize());
                }
                if (tikaConfigFile != null) {
                    extractor.setTikaConfig(tikaConfigFile);
                }
                extractor.setStats(stats);

                log.info("Using path {}", path);
                extractor.extract(binaryResourceProvider.getBinaries(path));
                // No explicit close() calls here: the Closer closes the extractor and then the
                // writer (reverse registration order) when the try block exits, exactly once each.
            }
        }
    }
}
Aggregations