Search in sources :

Example 1 with ValueVisitor

use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.

the class MapReduceShuffle method run.

/**
 * Shuffles the data under {@code rawNormPath} with a MapReduce job and, on success, replaces
 * {@code rawNormPath} with the shuffled output.
 *
 * @param rawNormPath
 *            input path of the job; it is deleted and replaced by the shuffle output when the job succeeds
 * @throws IOException
 *             propagated from file system operations or job submission
 * @throws ClassNotFoundException
 *             propagated from {@code Job#waitForCompletion}
 * @throws InterruptedException
 *             propagated from {@code Job#waitForCompletion}
 */
public void run(String rawNormPath) throws IOException, ClassNotFoundException, InterruptedException {
    RawSourceData.SourceType source = this.modelConfig.getDataSet().getSource();
    final Configuration conf = new Configuration();

    // add jars to hadoop mapper and reducer; the parser instance itself is discarded
    String[] libJarArgs = { "-libjars", addRuntimeJars() };
    new GenericOptionsParser(conf, libJarArgs);

    // turn on speculative execution for map and reduce tasks under every key variant
    String[] speculativeKeys = { GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE,
            GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE };
    for (String speculativeKey : speculativeKeys) {
        conf.setBoolean(speculativeKey, true);
    }

    String queueName = Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default");
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, queueName);
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 100);

    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for hdp 2.2.4, hdp.version should be set and configuration files should be added to the
        // container class path
        conf.set("hdp.version", hdpVersion);
        String[] siteFiles = { "hdfs-site.xml", "core-site.xml", "mapred-site.xml", "yarn-site.xml" };
        for (String siteFile : siteFiles) {
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile(siteFile), conf);
        }
    }

    // one can set guagua conf in shifuconfig; injected last so those entries win over the defaults above
    ValueVisitor confInjector = new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    };
    CommonUtils.injectHadoopShifuEnvironments(confInjector);

    int shuffleSize = getDataShuffleSize(rawNormPath, source);
    log.info("Try to shuffle data into - {} parts.", shuffleSize);
    conf.set(Constants.SHIFU_NORM_SHUFFLE_SIZE, Integer.toString(shuffleSize));

    String jobName = "Shifu: Shuffling normalized data - " + this.modelConfig.getModelSetName();
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(getClass());

    // map side
    job.setMapperClass(DataShuffle.ShuffleMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(DataShuffle.KvalPartitioner.class);

    // reduce side: one reducer per shuffle part
    job.setReducerClass(DataShuffle.ShuffleReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(shuffleSize);

    FileInputFormat.setInputPaths(job, rawNormPath);
    FileOutputFormat.setOutputPath(job, new Path(this.pathFinder.getShuffleDataPath()));

    // clean output firstly
    ShifuFileUtils.deleteFile(this.pathFinder.getShuffleDataPath(), source);

    // submit job and bail out early on failure
    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("MapReduce Shuffle Computing Job Failed.");
    }

    // move pig header and schema files first to make sure both end up in the final output
    String[] pigMetaFiles = { ".pig_header", ".pig_schema" };
    for (String pigMetaFile : pigMetaFiles) {
        Path pigMetaPath = new Path(rawNormPath, pigMetaFile);
        if (ShifuFileUtils.isFileExists(pigMetaPath, source)) {
            ShifuFileUtils.moveTo(pigMetaPath.toString(), this.pathFinder.getShuffleDataPath(), source);
        }
    }

    // swap the shuffled output into the place of the original input
    ShifuFileUtils.deleteFile(rawNormPath, source);
    ShifuFileUtils.move(this.pathFinder.getShuffleDataPath(), rawNormPath, source);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ValueVisitor(ml.shifu.shifu.util.ValueVisitor) RawSourceData(ml.shifu.shifu.container.obj.RawSourceData) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)

Example 2 with ValueVisitor

use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.

the class PigExecutor method submitJob.

/**
 * Run the pig script. Local or MapReduce mode is decided by parameter {@code sourceType}.
 *
 * @param modelConfig
 *            - model configuration
 * @param pigScriptPath
 *            - path of pig script; an absolute path is read from the file system, any other path is
 *            loaded as a classpath resource
 * @param paramsMap
 *            - additional parameters for pig script, may be null
 * @param sourceType
 *            - the mode run pig: pig-local/pig-hdfs
 * @param confMap
 *            the configuration map instance, may be null
 * @param pathFinder
 *            the path finder
 * @throws IOException
 *             when registering the script fails, or when a relative {@code pigScriptPath} cannot be found
 *             on the classpath
 */
public void submitJob(ModelConfig modelConfig, String pigScriptPath, Map<String, String> paramsMap, SourceType sourceType, Map<String, String> confMap, PathFinder pathFinder) throws IOException {
    // Run Pig Scripts
    final PigServer pigServer = createPigServer(sourceType);
    // copy shifu/hadoop environment settings into the pig context properties
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            pigServer.getPigContext().getProperties().put(key, value);
        }
    });
    // caller supplied configuration is applied afterwards, so it overrides injected entries with the same key
    if (confMap != null) {
        for (Map.Entry<String, String> entry : confMap.entrySet()) {
            pigServer.getPigContext().getProperties().put(entry.getKey(), entry.getValue());
        }
    }
    Map<String, String> pigParamsMap = CommonUtils.getPigParamMap(modelConfig, sourceType, pathFinder);
    if (paramsMap != null) {
        pigParamsMap.putAll(paramsMap);
    }
    log.debug("Pig submit parameters: {}", pigParamsMap);
    if (new File(pigScriptPath).isAbsolute()) {
        log.info("Pig script absolute path is {}", pigScriptPath);
        pigServer.registerScript(pigScriptPath, pigParamsMap);
    } else {
        log.info("Pig script relative path is {}", pigScriptPath);
        // fully qualified type name so no additional import is needed in this file
        java.io.InputStream scriptStream = PigExecutor.class.getClassLoader().getResourceAsStream(pigScriptPath);
        if (scriptStream == null) {
            // getResourceAsStream returns null for a missing resource; fail with a clear message
            // instead of handing null to the pig server
            throw new IOException("Pig script not found on classpath: " + pigScriptPath);
        }
        try {
            pigServer.registerScript(scriptStream, pigParamsMap);
        } finally {
            // make sure the resource stream is released whether or not registration succeeded
            scriptStream.close();
        }
    }
}
Also used : ValueVisitor(ml.shifu.shifu.util.ValueVisitor) PigServer(org.apache.pig.PigServer) Map(java.util.Map) File(java.io.File)

Example 3 with ValueVisitor

use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.

the class InitModelProcessor method getCountInfoByMRJob.

/**
 * Runs the "Column Type Auto Checking" MapReduce job over the raw data set and returns the count info
 * loaded from the job output by {@code getCountInfoMap}.
 *
 * @return the map produced by {@code getCountInfoMap(source, autoTypePath)} after a successful job
 * @throws IOException
 *             propagated from file system operations or job submission
 * @throws InterruptedException
 *             propagated from {@code Job#waitForCompletion}
 * @throws ClassNotFoundException
 *             propagated from {@code Job#waitForCompletion}
 * @throws RuntimeException
 *             if the MapReduce job does not complete successfully
 */
private Map<Integer, Data> getCountInfoByMRJob() throws IOException, InterruptedException, ClassNotFoundException {
    SourceType source = this.modelConfig.getDataSet().getSource();
    final Configuration conf = new Configuration();
    // add jars to hadoop mapper and reducer; the parser instance itself is discarded
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE, true);
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);
    // pass fully qualified locations of the model and column configuration files to the tasks
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    conf.set("mapred.reduce.slowstart.completed.maps", Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.9"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for hdp 2.2.4, hdp.version should be set and configuration files should be added to the
        // container class path
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
    // one can set guagua conf in shifuconfig; injected last so those entries win over the defaults above
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
    // use the factory method instead of the deprecated Job constructor
    Job job = Job.getInstance(conf, "Shifu: Column Type Auto Checking Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(AutoTypeDistinctCountMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(CountAndFrequentItemsWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.modelConfig.getDataSetRawPath())));
    job.setReducerClass(AutoTypeDistinctCountReducer.class);
    // a single reducer, so the job writes one consolidated output
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    String autoTypePath = super.getPathFinder().getAutoTypeFilePath(source);
    FileOutputFormat.setOutputPath(job, new Path(autoTypePath));
    // clean output firstly
    ShifuFileUtils.deleteFile(autoTypePath, source);
    // submit job
    if (job.waitForCompletion(true)) {
        long totalValidCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").getValue();
        long invalidTagCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").getValue();
        long filterOut = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT").getValue();
        log.info("Total valid records {}, invalid tag records {}, filter out records {}", totalValidCount, invalidTagCount, filterOut);
        // warn (without failing) when invalid-tag records reach 80% of the valid record count
        if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
            log.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
        }
        return getCountInfoMap(source, autoTypePath);
    } else {
        throw new RuntimeException("MapReduce Job Auto Type Distinct Count failed.");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ValueVisitor(ml.shifu.shifu.util.ValueVisitor) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)

Example 4 with ValueVisitor

use of ml.shifu.shifu.util.ValueVisitor in project shifu by ShifuML.

the class MapReducerStatsWorker method prepareJobConf.

/**
 * Populates {@code conf} with the settings used by the stats MapReduce job.
 *
 * @param source
 *            source type used to resolve the file system and the model/column configuration paths
 * @param conf
 *            configuration to populate in place
 * @param filePath
 *            value passed to the generic options parser as the {@code -files} argument
 * @throws IOException
 *             propagated from the option parser, file system or class path helpers
 */
private void prepareJobConf(RawSourceData.SourceType source, final Configuration conf, String filePath) throws IOException {
    // add jars to hadoop mapper and reducer; the parser instance itself is discarded
    String[] genericArgs = { "-libjars", addRuntimeJars(), "-files", filePath };
    new GenericOptionsParser(conf, genericArgs);

    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);

    String excludeMissing = Environment.getProperty(Constants.SHIFU_STATS_EXLCUDE_MISSING, "true");
    conf.set(Constants.SHIFU_STATS_EXLCUDE_MISSING, excludeMissing);

    // turn on speculative execution for map and reduce tasks under every key variant
    String[] speculativeKeys = { GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION,
            GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE,
            GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE };
    for (String speculativeKey : speculativeKeys) {
        conf.setBoolean(speculativeKey, true);
    }

    // pass fully qualified locations of the model and column configuration files to the tasks
    String modelConfigLocation = ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(this.pathFinder.getModelConfigPath(source))).toString();
    conf.set(Constants.SHIFU_MODEL_CONFIG, modelConfigLocation);
    String columnConfigLocation = ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(this.pathFinder.getColumnConfigPath(source))).toString();
    conf.set(Constants.SHIFU_COLUMN_CONFIG, columnConfigLocation);

    String queueName = Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default");
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, queueName);
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());

    // raise the max split locations setting to 5000 to suppress warnings
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);

    String slowStart = Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.8");
    conf.set("mapred.reduce.slowstart.completed.maps", slowStart);

    conf.set(Constants.SHIFU_STATS_FILTER_EXPRESSIONS, super.modelConfig.getSegmentFilterExpressionsAsString());
    log.info("segment expressions is {}", super.modelConfig.getSegmentFilterExpressionsAsString());

    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for hdp 2.2.4, hdp.version should be set and configuration files should be added to the
        // container class path
        conf.set("hdp.version", hdpVersion);
        String[] siteFiles = { "hdfs-site.xml", "core-site.xml", "mapred-site.xml", "yarn-site.xml" };
        for (String siteFile : siteFiles) {
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile(siteFile), conf);
        }
    }

    // one can set guagua conf in shifuconfig; injected last so those entries win over the defaults above
    ValueVisitor confInjector = new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    };
    CommonUtils.injectHadoopShifuEnvironments(confInjector);
}
Also used : Path(org.apache.hadoop.fs.Path) ValueVisitor(ml.shifu.shifu.util.ValueVisitor) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)

Aggregations

ValueVisitor (ml.shifu.shifu.util.ValueVisitor): 4, Path (org.apache.hadoop.fs.Path): 3, GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser): 3, Configuration (org.apache.hadoop.conf.Configuration): 2, Job (org.apache.hadoop.mapreduce.Job): 2, File (java.io.File): 1, Map (java.util.Map): 1, RawSourceData (ml.shifu.shifu.container.obj.RawSourceData): 1, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 1, PigServer (org.apache.pig.PigServer): 1