Example of using org.apache.jackrabbit.aws.ext.ds.S3DataStore in the Apache jackrabbit-oak project: the main method of the TextExtractorMain class.
/**
 * Entry point for the "tika" text-extraction tooling. Parses command-line options
 * and runs one or more of the actions named by the non-option arguments:
 * "report" (summary of binary data), "extract" (text extraction) and
 * "generate" (write the CSV data file from a NodeStore/BlobStore).
 *
 * @param args command-line arguments; non-option arguments select the action(s)
 * @throws Exception any failure is rethrown via the Closer after all registered
 *         resources have been released
 */
public static void main(String[] args) throws Exception {
// Closer collects every resource opened below so the finally block releases them all.
Closer closer = Closer.create();
// Help text shown for the non-option arguments.
String h = "tika [extract|report|generate]\n" + "\n" + "report : Generates a summary report related to binary data\n" + "extract : Performs the text extraction\n" + "generate : Generates the csv data file based on configured NodeStore/BlobStore";
try {
// Declare all supported command-line options (jopt-simple).
OptionParser parser = new OptionParser();
OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"), "show help").forHelp();
OptionSpec<String> nodeStoreSpec = parser.accepts("nodestore", "NodeStore detail /path/to/oak/repository | mongodb://host:port/database").withRequiredArg().ofType(String.class);
OptionSpec<String> pathSpec = parser.accepts("path", "Path in repository under which the binaries would be searched").withRequiredArg().ofType(String.class);
OptionSpec<File> dataFileSpec = parser.accepts("data-file", "Data file in csv format containing the binary metadata").withRequiredArg().ofType(File.class);
OptionSpec<File> tikaConfigSpec = parser.accepts("tika-config", "Tika config file path").withRequiredArg().ofType(File.class);
OptionSpec<File> fdsDirSpec = parser.accepts("fds-path", "Path of directory used by FileDataStore").withRequiredArg().ofType(File.class);
OptionSpec<File> s3ConfigSpec = parser.accepts("s3-config-path", "Path of properties file containing config for S3DataStore").withRequiredArg().ofType(File.class);
OptionSpec<File> storeDirSpec = parser.accepts("store-path", "Path of directory used to store extracted text content").withRequiredArg().ofType(File.class);
OptionSpec<Integer> poolSize = parser.accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults " + "to number of cores on the system").withRequiredArg().ofType(Integer.class);
OptionSpec<String> nonOption = parser.nonOptions(h);
OptionSet options = parser.parse(args);
List<String> nonOptions = nonOption.values(options);
// --help: print usage and exit successfully.
if (options.has(help)) {
parser.printHelpOn(System.out);
System.exit(0);
}
// No action requested: print usage to stderr and exit with an error code.
if (nonOptions.isEmpty()) {
parser.printHelpOn(System.err);
System.exit(1);
}
// Several actions may be requested in a single invocation.
boolean report = nonOptions.contains("report");
boolean extract = nonOptions.contains("extract");
boolean generate = nonOptions.contains("generate");
File dataFile = null;
File storeDir = null;
File tikaConfigFile = null;
BlobStore blobStore = null;
BinaryResourceProvider binaryResourceProvider = null;
BinaryStats stats = null;
// Default repository path; may be overridden by --path in the extract branch below.
String path = "/";
if (options.has(tikaConfigSpec)) {
tikaConfigFile = tikaConfigSpec.value(options);
checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist", tikaConfigFile.getAbsolutePath());
}
if (options.has(storeDirSpec)) {
storeDir = storeDirSpec.value(options);
// A non-existent store dir is allowed (presumably created later); an existing
// non-directory path is rejected.
if (storeDir.exists()) {
// NOTE(review): the second format argument is storeDirSpec.options() (the option
// names), which reads oddly in the message — confirm this is intentional.
checkArgument(storeDir.isDirectory(), "Path [%s] specified for storing extracted " + "text content '%s' is not a directory", storeDir.getAbsolutePath(), storeDirSpec.options());
}
}
// --fds-path: back the BlobStore with a local FileDataStore.
if (options.has(fdsDirSpec)) {
File fdsDir = fdsDirSpec.value(options);
checkArgument(fdsDir.exists(), "FileDataStore %s does not exist", fdsDir.getAbsolutePath());
FileDataStore fds = new FileDataStore();
fds.setPath(fdsDir.getAbsolutePath());
// init(null): FileDataStore presumably ignores the homeDir argument once
// setPath(...) has been called — TODO confirm against FileDataStore docs.
fds.init(null);
blobStore = new DataStoreBlobStore(fds);
}
// --s3-config-path: back the BlobStore with an S3DataStore configured from a properties file.
if (options.has(s3ConfigSpec)) {
File s3Config = s3ConfigSpec.value(options);
checkArgument(s3Config.exists() && s3Config.canRead(), "S3DataStore config cannot be read from [%s]", s3Config.getAbsolutePath());
Properties props = loadProperties(s3Config);
log.info("Loaded properties for S3DataStore from {}", s3Config.getAbsolutePath());
String pathProp = "path";
String repoPath = props.getProperty(pathProp);
checkNotNull(repoPath, "Missing required property [%s] from S3DataStore config loaded from [%s]", pathProp, s3Config);
//Check if 'secret' key is defined. It should be non null for references
//to be generated. As the ref are transient we can just use any random value
//if not specified
String secretConfig = "secret";
if (props.getProperty(secretConfig) == null) {
props.setProperty(secretConfig, UUID.randomUUID().toString());
}
log.info("Using {} for S3DataStore ", repoPath);
DataStore ds = createS3DataStore(props);
PropertiesUtil.populate(ds, toMap(props), false);
// NOTE(review): init() is given pathProp — the literal string "path" — rather than
// the configured repoPath value read above. This looks like a bug; confirm what
// S3DataStore.init(homeDir) expects before relying on this.
ds.init(pathProp);
blobStore = new DataStoreBlobStore(ds);
closer.register(asCloseable(ds));
}
if (options.has(dataFileSpec)) {
dataFile = dataFileSpec.value(options);
}
// The data file is required for every action (read by report/extract, written by generate).
checkNotNull(dataFile, "Data file not configured with %s", dataFileSpec);
if (report || extract) {
checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
// Binaries are enumerated from the previously generated CSV file.
binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
if (binaryResourceProvider instanceof Closeable) {
closer.register((Closeable) binaryResourceProvider);
}
stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
String summary = stats.getSummary();
log.info(summary);
}
if (generate) {
String src = nodeStoreSpec.value(options);
checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " + "must be specified via %s", fdsDirSpec.options());
checkNotNull(dataFile, "Data file path not provided");
// Walk the repository and write one CSV row per binary found under `path`.
NodeStore nodeStore = bootStrapNodeStore(src, blobStore, closer);
BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);
CSVFileGenerator generator = new CSVFileGenerator(dataFile);
generator.generate(brp.getBinaries(path));
}
if (extract) {
checkNotNull(storeDir, "Directory to store extracted text content " + "must be specified via %s", storeDirSpec.options());
checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " + "must be specified via %s", fdsDirSpec.options());
DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false);
TextExtractor extractor = new TextExtractor(writer);
if (options.has(poolSize)) {
extractor.setThreadPoolSize(poolSize.value(options));
}
if (tikaConfigFile != null) {
extractor.setTikaConfig(tikaConfigFile);
}
if (options.has(pathSpec)) {
path = pathSpec.value(options);
}
closer.register(writer);
closer.register(extractor);
extractor.setStats(stats);
log.info("Using path {}", path);
extractor.extract(binaryResourceProvider.getBinaries(path));
// NOTE(review): both objects were registered with the Closer above, so they are
// closed again in the finally block — presumably close() is idempotent; confirm.
extractor.close();
writer.close();
}
} catch (Throwable e) {
// Guava Closer idiom: rethrow records the primary exception so close() failures
// are suppressed rather than masking it.
throw closer.rethrow(e);
} finally {
closer.close();
}
}
Example of using org.apache.jackrabbit.aws.ext.ds.S3DataStore in the Apache jackrabbit-oak project: the create method of the S3DataStoreFactory class.
/**
 * Builds the S3-backed {@link BlobStore} that this factory produces.
 * The underlying {@link S3DataStore} is configured from the factory's
 * properties and directory, initialized against the temporary home
 * directory, and registered with the supplied {@link Closer} for teardown.
 *
 * @param closer receives the closeable wrapper for the data store and temp dir
 * @return a {@code DataStoreBlobStore}, or a {@code SafeDataStoreBlobStore}
 *         (tolerant of missing binaries) when {@code ignoreMissingBlobs} is set
 * @throws IOException if data store initialization fails
 */
@Override
public BlobStore create(Closer closer) throws IOException {
    S3DataStore ds = new S3DataStore();
    ds.setProperties(props);
    ds.setPath(directory);
    try {
        ds.init(tempHomeDir.getPath());
    } catch (RepositoryException e) {
        // Adapt the checked repository failure to this method's IOException contract,
        // preserving the original cause.
        throw new IOException(e);
    }
    // Ensure the store and its temporary home directory are cleaned up with the closer.
    closer.register(asCloseable(ds, tempHomeDir));
    return ignoreMissingBlobs
            ? new SafeDataStoreBlobStore(ds)
            : new DataStoreBlobStore(ds);
}
Example of using org.apache.jackrabbit.aws.ext.ds.S3DataStore in the Apache jackrabbit-oak project: the createS3DataStore method of the TextExtractorMain class.
/**
 * Creates an S3-backed {@link DataStore} configured with the given properties.
 * The returned store is not yet initialized; the caller is responsible for
 * invoking {@code init(...)} before use.
 *
 * @param props S3DataStore configuration properties
 * @return the configured (uninitialized) data store
 * @throws IOException declared for interface symmetry with other factory helpers
 */
private static DataStore createS3DataStore(Properties props) throws IOException {
    S3DataStore dataStore = new S3DataStore();
    dataStore.setProperties(props);
    return dataStore;
}
Aggregations