Use of org.apache.hadoop.util.GenericOptionsParser in project Gaffer by gchq.
The class AddElementsFromHdfsHandler, method fetchElements.
private void fetchElements(final AddElementsFromHdfs operation, final AccumuloStore store) throws OperationException {
    final int response;
    try {
        /* Parse any Hadoop arguments passed on the command line and use these to configure the Tool */
        final Configuration configuration = new GenericOptionsParser(operation.getCommandLineArgs()).getConfiguration();
        final AddElementsFromHdfsTool fetchTool = new AddElementsFromHdfsTool(new AccumuloAddElementsFromHdfsJobFactory(configuration), operation, store);
        LOGGER.info("Running FetchElementsFromHdfsTool job");
        response = ToolRunner.run(fetchTool, operation.getCommandLineArgs());
        LOGGER.info("Finished running FetchElementsFromHdfsTool job");
    } catch (final Exception e) {
        LOGGER.error("Failed to fetch elements from HDFS: {}", e.getMessage());
        throw new OperationException("Failed to fetch elements from HDFS", e);
    }
    if (AddElementsFromHdfsTool.SUCCESS_RESPONSE != response) {
        LOGGER.error("Failed to fetch elements from HDFS. Response code was {}", response);
        throw new OperationException("Failed to fetch elements from HDFS. Response code was: " + response);
    }
}
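The handler follows the standard Hadoop Tool/ToolRunner idiom: GenericOptionsParser consumes the generic flags (such as -D, -files and -libjars) from the command-line arguments, and the resulting Configuration is passed to a Tool that is then executed via ToolRunner, which returns an exit code. A minimal sketch of that idiom, independent of Gaffer (the class name and job details here are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ExampleTool extends Configured implements Tool {

    @Override
    public int run(final String[] args) throws Exception {
        // getConf() already reflects any -D/-files/-libjars options that
        // ToolRunner parsed with GenericOptionsParser before calling run().
        final Configuration conf = getConf();
        // ... build and submit a job using conf, return 0 on success ...
        return 0;
    }

    public static void main(final String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new ExampleTool(), args));
    }
}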
Use of org.apache.hadoop.util.GenericOptionsParser in project shifu by ShifuML.
The class StatsModelProcessor, method runCorrMapReduceJob.
private void runCorrMapReduceJob() throws IOException, InterruptedException, ClassNotFoundException {
    SourceType source = this.modelConfig.getDataSet().getSource();
    final Configuration conf = new Configuration();
    String modelConfigPath = ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString();
    String columnConfigPath = ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString();
    // add jars and files to the hadoop mappers and reducers
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars(), "-files", modelConfigPath + "," + columnConfigPath });
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    // A lot of data has to be transferred to the reducers, so lower the default slowstart
    // threshold to 0.7 so that reducers start copying data earlier.
    conf.set("mapred.reduce.slowstart.completed.maps", Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.7"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version must be set and the cluster configuration files must be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
    boolean isFastCorrelation = Environment.getProperty("shifu.correlation.fast", "false").equalsIgnoreCase(Boolean.TRUE.toString());
    int threads = parseThreadNum();
    conf.setInt("mapreduce.map.cpu.vcores", threads);
    // Guagua settings can also be configured in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
    // e.g. -Dmapreduce.map.java.opts=-Xmx3000M
    if (System.getProperty("mapreduce.map.memory.mb") == null || System.getProperty("mapreduce.map.java.opts") == null) {
        setMapperMemory(conf, threads, isFastCorrelation);
    } else {
        conf.set("mapreduce.map.memory.mb", System.getProperty("mapreduce.map.memory.mb"));
        conf.set("mapreduce.map.java.opts", System.getProperty("mapreduce.map.java.opts"));
        log.info("Correlation map memory is set to {}MB from command line parameters.", System.getProperty("mapreduce.map.memory.mb"));
    }
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Correlation Computing Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    if (isFastCorrelation) {
        job.setMapperClass(FastCorrelationMultithreadedMapper.class);
        FastCorrelationMultithreadedMapper.setMapperClass(job, FastCorrelationMapper.class);
        FastCorrelationMultithreadedMapper.setNumberOfThreads(job, threads);
    } else {
        job.setMapperClass(CorrelationMultithreadedMapper.class);
        CorrelationMultithreadedMapper.setMapperClass(job, CorrelationMapper.class);
        CorrelationMultithreadedMapper.setNumberOfThreads(job, threads);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(CorrelationWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.modelConfig.getDataSetRawPath())));
    job.setReducerClass(CorrelationReducer.class);
    // Use roughly one reducer per 50 columns (minimum 2) so the data is not all copied to a single reducer;
    // with over 3000 features each mapper output is about 700MB, so 400 mappers would produce about 280GB.
    job.setNumReduceTasks(this.columnConfigList.size() < 50 ? 2 : this.columnConfigList.size() / 50);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    String corrPath = super.getPathFinder().getCorrelationPath(source);
    FileOutputFormat.setOutputPath(job, new Path(corrPath));
    // clean the output directory first
    ShifuFileUtils.deleteFile(corrPath, source);
    // submit the job
    if (job.waitForCompletion(true)) {
        dumpAndCalculateCorrelationResult(source, corrPath);
    } else {
        throw new RuntimeException("MapReduce Correlation Computing Job failed.");
    }
}
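Note the idiom on the GenericOptionsParser line above: the parser instance itself is never used afterwards. Constructing it with an existing Configuration is enough, because parsing the -libjars and -files options records them in that Configuration so the listed jars and files are shipped with the submitted job. A minimal sketch of the same side-effect usage (the paths are placeholders, not Shifu's real layout):

Configuration conf = new Configuration();
// Constructing the parser mutates conf: the -libjars and -files values are
// stored so the job client ships those jars/files to the cluster.
new GenericOptionsParser(conf, new String[] {
        "-libjars", "/path/to/dep1.jar,/path/to/dep2.jar",
        "-files", "/path/to/ModelConfig.json,/path/to/ColumnConfig.json" });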
Use of org.apache.hadoop.util.GenericOptionsParser in project shifu by ShifuML.
The class VarSelectModelProcessor, method prepareSEJobConf.
private void prepareSEJobConf(SourceType source, final Configuration conf) throws IOException {
    String modelConfigPath = ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString();
    String columnConfigPath = ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString();
    // only the first model is used for sensitivity analysis
    String seModelPath = ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelsPath(), "model0." + modelConfig.getAlgorithm().toLowerCase())).toString();
    String filePath = modelConfigPath + "," + columnConfigPath + "," + seModelPath;
    // add jars and files to the hadoop mappers and reducers
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars(), "-files", filePath });
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE, true);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    // raise mapreduce.job.max.split.locations to suppress warnings about too many split locations
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);
    // Temporarily set to false: some clusters use gzip by default and CombineInputFormat
    // would split gzip files (a bug).
    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, false);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
    conf.set("mapred.reduce.slowstart.completed.maps", Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.9"));
    conf.set(Constants.SHIFU_VARSELECT_FILTEROUT_TYPE, modelConfig.getVarSelectFilterBy());
    Float filterOutRatio = this.modelConfig.getVarSelect().getFilterOutRatio();
    if (filterOutRatio == null) {
        log.warn("filterOutRatio in var select is not set. Using default value 0.05.");
        filterOutRatio = 0.05f;
    }
    if (filterOutRatio.compareTo(Float.valueOf(1.0f)) >= 0) {
        throw new IllegalArgumentException("WrapperRatio should be in (0, 1).");
    }
    conf.setFloat(Constants.SHIFU_VARSELECT_FILTEROUT_RATIO, filterOutRatio);
    conf.setInt(Constants.SHIFU_VARSELECT_FILTER_NUM, this.modelConfig.getVarSelectFilterNum());
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version must be set and the cluster configuration files must be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
    // Guagua settings can also be configured in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
    // No matter how mapreduce.task.io.sort.mb is set elsewhere, the sensitivity job has only one
    // reducer and each mapper outputs only column stats, so 150MB is enough.
    conf.setInt("mapreduce.task.io.sort.mb", 150);
}
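The -files entries here (the ModelConfig, the ColumnConfig and the first model file) travel with the job through the distributed cache, so each task can open them by bare file name in its working directory. A small sketch of the consuming side, assuming the default symlink behavior (the snippet below is illustrative, not Shifu's actual mapper code):

// Inside a mapper's or reducer's setup(): files shipped via -files are
// symlinked into the task working directory under their base names.
java.io.File modelConfigFile = new java.io.File("ModelConfig.json");
java.io.File columnConfigFile = new java.io.File("ColumnConfig.json");
// ... parse both files and keep the results for use in map()/reduce() ...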
Use of org.apache.hadoop.util.GenericOptionsParser in project shifu by ShifuML.
The class MapReduceShuffle, method run.
public void run(String rawNormPath) throws IOException, ClassNotFoundException, InterruptedException {
    RawSourceData.SourceType source = this.modelConfig.getDataSet().getSource();
    final Configuration conf = new Configuration();
    // add jars to the hadoop mappers and reducers
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE, true);
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 100);
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version must be set and the cluster configuration files must be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
    // Guagua settings can also be configured in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
    int shuffleSize = getDataShuffleSize(rawNormPath, source);
    log.info("Try to shuffle data into - {} parts.", shuffleSize);
    conf.set(Constants.SHIFU_NORM_SHUFFLE_SIZE, Integer.toString(shuffleSize));
    Job job = Job.getInstance(conf, "Shifu: Shuffling normalized data - " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(DataShuffle.ShuffleMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(DataShuffle.KvalPartitioner.class);
    job.setReducerClass(DataShuffle.ShuffleReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(shuffleSize);
    FileInputFormat.setInputPaths(job, rawNormPath);
    FileOutputFormat.setOutputPath(job, new Path(this.pathFinder.getShuffleDataPath()));
    // clean the output directory first
    ShifuFileUtils.deleteFile(this.pathFinder.getShuffleDataPath(), source);
    // submit the job
    if (job.waitForCompletion(true)) {
        // move the pig header and schema files first to make sure both end up in the final output
        if (ShifuFileUtils.isFileExists(new Path(rawNormPath, ".pig_header"), source)) {
            ShifuFileUtils.moveTo(new Path(rawNormPath, ".pig_header").toString(), this.pathFinder.getShuffleDataPath(), source);
        }
        if (ShifuFileUtils.isFileExists(new Path(rawNormPath, ".pig_schema"), source)) {
            ShifuFileUtils.moveTo(new Path(rawNormPath, ".pig_schema").toString(), this.pathFinder.getShuffleDataPath(), source);
        }
        ShifuFileUtils.deleteFile(rawNormPath, source);
        ShifuFileUtils.move(this.pathFinder.getShuffleDataPath(), rawNormPath, source);
    } else {
        throw new RuntimeException("MapReduce Shuffle Computing Job Failed.");
    }
}
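DataShuffle.KvalPartitioner decides which of the shuffleSize reducers each record goes to. A partitioner for this kind of shuffle typically just maps the integer key onto the reducer count; a minimal sketch under that assumption (Shifu's real implementation may differ):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Assumed behavior: spread records across reducers by key modulo the reducer
// count, with keys assumed to be non-negative shuffle bucket ids.
public class ModuloPartitioner extends Partitioner<IntWritable, Text> {

    @Override
    public int getPartition(IntWritable key, Text value, int numReduceTasks) {
        return key.get() % numReduceTasks;
    }
}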
Use of org.apache.hadoop.util.GenericOptionsParser in project shifu by ShifuML.
The class InitModelProcessor, method getCountInfoByMRJob.
private Map<Integer, Data> getCountInfoByMRJob() throws IOException, InterruptedException, ClassNotFoundException {
    SourceType source = this.modelConfig.getDataSet().getSource();
    final Configuration conf = new Configuration();
    // add jars to the hadoop mappers and reducers
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE, true);
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    conf.set("mapred.reduce.slowstart.completed.maps", Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.9"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version must be set and the cluster configuration files must be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
    // Guagua settings can also be configured in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Column Type Auto Checking Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(AutoTypeDistinctCountMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(CountAndFrequentItemsWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.modelConfig.getDataSetRawPath())));
    job.setReducerClass(AutoTypeDistinctCountReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    String autoTypePath = super.getPathFinder().getAutoTypeFilePath(source);
    FileOutputFormat.setOutputPath(job, new Path(autoTypePath));
    // clean the output directory first
    ShifuFileUtils.deleteFile(autoTypePath, source);
    // submit the job
    if (job.waitForCompletion(true)) {
        long totalValidCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").getValue();
        long invalidTagCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").getValue();
        long filterOut = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT").getValue();
        log.info("Total valid records {}, invalid tag records {}, filter out records {}", totalValidCount, invalidTagCount, filterOut);
        if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
            log.error("Too many invalid tags, please check your configuration on positive tags and negative tags.");
        }
        return getCountInfoMap(source, autoTypePath);
    } else {
        throw new RuntimeException("MapReduce Job Auto Type Distinct Count failed.");
    }
}
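The counters read after completion (TOTAL_VALID_COUNT, INVALID_TAG, FILTER_OUT_COUNT in the SHIFU_GROUP_COUNTER group) are incremented by the map tasks. A sketch of the producing side, assuming the same group and counter names (the actual AutoTypeDistinctCountMapper may organize this differently, and the invalidTag flag below is hypothetical):

// Inside the mapper's map() method:
context.getCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").increment(1L);
if (invalidTag) {
    // hypothetical flag set while parsing the record's tag
    context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
}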