use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.
the class MapReduceShuffle method run.
/**
 * Shuffles the normalized data under {@code rawNormPath} with a MapReduce job and, when the job
 * succeeds, replaces {@code rawNormPath} with the shuffled output.
 * <p>
 * The Pig metadata files ({@code .pig_header} and {@code .pig_schema}) found under
 * {@code rawNormPath} are moved into the shuffle output before the original directory is deleted.
 *
 * @param rawNormPath
 *            path of the normalized data; used as the job input and overwritten on success
 * @throws IOException
 *             propagated from job setup, job execution or the file operations
 * @throws ClassNotFoundException
 *             propagated from {@code Job#waitForCompletion}
 * @throws InterruptedException
 *             propagated from {@code Job#waitForCompletion}
 * @throws RuntimeException
 *             if the MapReduce job does not complete successfully
 */
public void run(String rawNormPath) throws IOException, ClassNotFoundException, InterruptedException {
    RawSourceData.SourceType source = this.modelConfig.getDataSet().getSource();
    final Configuration conf = new Configuration();

    // Ship the runtime jars to the mapper and reducer tasks.
    String[] libJarArgs = { "-libjars", addRuntimeJars() };
    new GenericOptionsParser(conf, libJarArgs);

    // Turn speculative execution on under both the old and the new property names.
    String[] speculativeKeys = {
            GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE,
            GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE };
    for (String speculativeKey : speculativeKeys) {
        conf.setBoolean(speculativeKey, true);
    }

    String queueName = Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default");
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, queueName);
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 100);

    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // HDP 2.2.4 needs hdp.version set and the site configuration files on the container class path.
        conf.set("hdp.version", hdpVersion);
        String[] siteFiles = { "hdfs-site.xml", "core-site.xml", "mapred-site.xml", "yarn-site.xml" };
        for (String siteFile : siteFiles) {
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile(siteFile), conf);
        }
    }

    // Settings from shifuconfig are applied last, so they can override the values set above.
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });

    int shuffleSize = getDataShuffleSize(rawNormPath, source);
    log.info("Try to shuffle data into - {} parts.", shuffleSize);
    conf.set(Constants.SHIFU_NORM_SHUFFLE_SIZE, Integer.toString(shuffleSize));

    String jobName = "Shifu: Shuffling normalized data - " + this.modelConfig.getModelSetName();
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(getClass());

    // Map side.
    job.setMapperClass(DataShuffle.ShuffleMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(DataShuffle.KvalPartitioner.class);

    // Reduce side: one reducer per shuffle part.
    job.setReducerClass(DataShuffle.ShuffleReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(shuffleSize);

    FileInputFormat.setInputPaths(job, rawNormPath);
    FileOutputFormat.setOutputPath(job, new Path(this.pathFinder.getShuffleDataPath()));

    // Remove any previous shuffle output before submitting the job.
    ShifuFileUtils.deleteFile(this.pathFinder.getShuffleDataPath(), source);

    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("MapReduce Shuffle Computing Job Failed.");
    }

    // Carry the Pig header and schema files over so they end up in the final output.
    String[] pigMetaFiles = { ".pig_header", ".pig_schema" };
    for (String pigMetaFile : pigMetaFiles) {
        if (ShifuFileUtils.isFileExists(new Path(rawNormPath, pigMetaFile), source)) {
            ShifuFileUtils.moveTo(new Path(rawNormPath, pigMetaFile).toString(),
                    this.pathFinder.getShuffleDataPath(), source);
        }
    }

    // Swap the shuffled data into the place of the original normalized data.
    ShifuFileUtils.deleteFile(rawNormPath, source);
    ShifuFileUtils.move(this.pathFinder.getShuffleDataPath(), rawNormPath, source);
}
use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.
the class PigExecutor method submitJob.
/**
 * Registers a Pig script on a newly created {@code PigServer}. The server is created by
 * {@code createPigServer(sourceType)}, so the run mode is decided by {@code sourceType}.
 * <p>
 * Properties are applied to the Pig context in this order: values injected by
 * {@code CommonUtils.injectHadoopShifuEnvironments}, then the entries of {@code confMap}.
 * Script parameters start from {@code CommonUtils.getPigParamMap} and are then overlaid with
 * {@code paramsMap}.
 * <p>
 * An absolute {@code pigScriptPath} is registered by file path; any other path is loaded as a
 * classpath resource through the class loader of {@code PigExecutor}.
 *
 * @param modelConfig
 *            - model configuration
 * @param pigScriptPath
 *            - path of pig script (absolute file path, or a classpath resource name)
 * @param paramsMap
 *            - additional parameters for pig script, may be {@code null}
 * @param sourceType
 *            - the mode run pig: pig-local/pig-hdfs
 * @param confMap
 *            the configuration map instance, may be {@code null}
 * @param pathFinder
 *            the path finder
 * @throws IOException
 *             if the script cannot be found on the classpath, or if building the Pig
 *             parameters or registering the script fails
 */
public void submitJob(ModelConfig modelConfig, String pigScriptPath, Map<String, String> paramsMap, SourceType sourceType, Map<String, String> confMap, PathFinder pathFinder) throws IOException {
    final PigServer pigServer = createPigServer(sourceType);

    // Environment-provided settings go in first ...
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
        @Override
        public void inject(Object key, Object value) {
            pigServer.getPigContext().getProperties().put(key, value);
        }
    });

    // ... so that caller-supplied configuration replaces them for identical keys.
    if (confMap != null) {
        for (Map.Entry<String, String> entry : confMap.entrySet()) {
            pigServer.getPigContext().getProperties().put(entry.getKey(), entry.getValue());
        }
    }

    Map<String, String> pigParamsMap = CommonUtils.getPigParamMap(modelConfig, sourceType, pathFinder);
    if (paramsMap != null) {
        pigParamsMap.putAll(paramsMap);
    }
    log.debug("Pig submit parameters: {}", pigParamsMap);

    if (new File(pigScriptPath).isAbsolute()) {
        log.info("Pig script absolute path is {}", pigScriptPath);
        pigServer.registerScript(pigScriptPath, pigParamsMap);
    } else {
        log.info("Pig script relative path is {}", pigScriptPath);
        java.io.InputStream scriptStream = PigExecutor.class.getClassLoader().getResourceAsStream(pigScriptPath);
        if (scriptStream == null) {
            // getResourceAsStream returns null for a missing resource; report it explicitly
            // instead of passing null on to the Pig server.
            throw new java.io.FileNotFoundException("Pig script not found on classpath: " + pigScriptPath);
        }
        try {
            pigServer.registerScript(scriptStream, pigParamsMap);
        } finally {
            // This method opened the stream, so it closes it; a second close by the callee
            // would be harmless.
            scriptStream.close();
        }
    }
}
use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.
the class InitModelProcessor method getCountInfoByMRJob.
/**
 * Runs the "column type auto checking" MapReduce job over the raw data set and returns the
 * per-column count information read back from the job output.
 * <p>
 * After a successful run the valid, invalid-tag and filtered-out counters are logged, and an
 * error is logged when invalid tags reach 80% of the valid record count.
 *
 * @return the map built by {@code getCountInfoMap} from the auto type output path
 * @throws IOException
 *             propagated from job setup, job execution or the file operations
 * @throws InterruptedException
 *             propagated from {@code Job#waitForCompletion}
 * @throws ClassNotFoundException
 *             propagated from {@code Job#waitForCompletion}
 * @throws RuntimeException
 *             if the MapReduce job does not complete successfully
 */
private Map<Integer, Data> getCountInfoByMRJob() throws IOException, InterruptedException, ClassNotFoundException {
    SourceType source = this.modelConfig.getDataSet().getSource();
    final Configuration conf = new Configuration();

    // Ship the runtime jars to the mapper and reducer tasks.
    String[] libJarArgs = { "-libjars", addRuntimeJars() };
    new GenericOptionsParser(conf, libJarArgs);

    // Turn speculative execution on under both the old and the new property names.
    String[] speculativeKeys = {
            GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE,
            GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE };
    for (String speculativeKey : speculativeKeys) {
        conf.setBoolean(speculativeKey, true);
    }

    String queueName = Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default");
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, queueName);
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);

    // Fully qualified locations of the model and column configuration files.
    conf.set(Constants.SHIFU_MODEL_CONFIG,
            ShifuFileUtils.getFileSystemBySourceType(source)
                    .makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG,
            ShifuFileUtils.getFileSystemBySourceType(source)
                    .makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());

    String reduceSlowStart = Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.9");
    conf.set("mapred.reduce.slowstart.completed.maps", reduceSlowStart);

    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // HDP 2.2.4 needs hdp.version set and the site configuration files on the container class path.
        conf.set("hdp.version", hdpVersion);
        String[] siteFiles = { "hdfs-site.xml", "core-site.xml", "mapred-site.xml", "yarn-site.xml" };
        for (String siteFile : siteFiles) {
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile(siteFile), conf);
        }
    }

    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);

    // Settings from shifuconfig are applied last, so they can override the values set above.
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });

    String jobName = "Shifu: Column Type Auto Checking Job : " + this.modelConfig.getModelSetName();
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, jobName);
    job.setJarByClass(getClass());

    // Map side.
    job.setMapperClass(AutoTypeDistinctCountMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(CountAndFrequentItemsWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job,
            ShifuFileUtils.getFileSystemBySourceType(source)
                    .makeQualified(new Path(super.modelConfig.getDataSetRawPath())));

    // Reduce side: a single reducer writes text output.
    job.setReducerClass(AutoTypeDistinctCountReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String autoTypePath = super.getPathFinder().getAutoTypeFilePath(source);
    FileOutputFormat.setOutputPath(job, new Path(autoTypePath));

    // Remove any previous output before submitting the job.
    ShifuFileUtils.deleteFile(autoTypePath, source);

    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("MapReduce Job Auto Type Distinct Count failed.");
    }

    long totalValidCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").getValue();
    long invalidTagCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").getValue();
    long filterOut = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT").getValue();
    log.info("Total valid records {}, invalid tag records {}, filter out records {}", totalValidCount, invalidTagCount, filterOut);

    // Flag the run when invalid tags amount to at least 80% of the valid record count.
    boolean tooManyInvalidTags = totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d;
    if (tooManyInvalidTags) {
        log.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
    }
    return getCountInfoMap(source, autoTypePath);
}
use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.
the class MapReducerStatsWorker method prepareJobConf.
/**
 * Populates {@code conf} with the settings used by the stats MapReduce job.
 *
 * @param source
 *            source type used to resolve the file system and the configuration file paths
 * @param conf
 *            Hadoop configuration to fill in; modified in place
 * @param filePath
 *            value passed to the generic options parser as the {@code -files} argument
 * @throws IOException
 *             propagated from the option parsing or file system calls
 */
private void prepareJobConf(RawSourceData.SourceType source, final Configuration conf, String filePath) throws IOException {
    // Ship the runtime jars and the extra file(s) to the mapper and reducer tasks.
    String[] genericArgs = { "-libjars", addRuntimeJars(), "-files", filePath };
    new GenericOptionsParser(conf, genericArgs);

    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);

    String excludeMissing = Environment.getProperty(Constants.SHIFU_STATS_EXLCUDE_MISSING, "true");
    conf.set(Constants.SHIFU_STATS_EXLCUDE_MISSING, excludeMissing);

    // Turn speculative execution on under both the old and the new property names.
    String[] speculativeKeys = {
            GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE,
            GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE };
    for (String speculativeKey : speculativeKeys) {
        conf.setBoolean(speculativeKey, true);
    }

    // Fully qualified locations of the model and column configuration files.
    conf.set(Constants.SHIFU_MODEL_CONFIG,
            ShifuFileUtils.getFileSystemBySourceType(source)
                    .makeQualified(new Path(this.pathFinder.getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG,
            ShifuFileUtils.getFileSystemBySourceType(source)
                    .makeQualified(new Path(this.pathFinder.getColumnConfigPath(source))).toString());

    String queueName = Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default");
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, queueName);
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());

    // The max split locations limit is set to 5000 here.
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);

    String reduceSlowStart = Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.8");
    conf.set("mapred.reduce.slowstart.completed.maps", reduceSlowStart);

    conf.set(Constants.SHIFU_STATS_FILTER_EXPRESSIONS, super.modelConfig.getSegmentFilterExpressionsAsString());
    log.info("segment expressions is {}", super.modelConfig.getSegmentFilterExpressionsAsString());

    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // HDP 2.2.4 needs hdp.version set and the site configuration files on the container class path.
        conf.set("hdp.version", hdpVersion);
        String[] siteFiles = { "hdfs-site.xml", "core-site.xml", "mapred-site.xml", "yarn-site.xml" };
        for (String siteFile : siteFiles) {
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile(siteFile), conf);
        }
    }

    // Settings from shifuconfig are applied last, so they can override the values set above.
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
}
Aggregations