Example 31 with GenericOptionsParser

Use of org.apache.hadoop.util.GenericOptionsParser in project shifu by ShifuML.

The class PostTrainModelProcessor, method runMRFeatureImportanceJob:

private void runMRFeatureImportanceJob(SourceType source, String output) throws IOException, InterruptedException, ClassNotFoundException {
    final Configuration conf = new Configuration();
    // add jars to the Hadoop mapper and reducer classpath; the parser applies
    // -libjars to conf as a side effect, so the instance can be discarded
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });
    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
    conf.set(Constants.SHIFU_STATS_EXLCUDE_MISSING, Environment.getProperty(Constants.SHIFU_STATS_EXLCUDE_MISSING, "true"));
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    // raise mapreduce.job.max.split.locations to suppress split-location warnings
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);
    conf.set("mapred.reduce.slowstart.completed.maps", Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.8"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version should be set and the configuration files should be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
    // guagua configuration can also be set in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {

        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
    @SuppressWarnings("deprecation") Job job = new Job(conf, "Shifu: Post Train FeatureImportance : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(FeatureImportanceMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.modelConfig.getDataSetRawPath())));
    job.setReducerClass(FeatureImportanceReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(output));
    // clean output firstly
    ShifuFileUtils.deleteFile(output, source);
    // submit job
    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("Post train Feature Importance MapReduce job is failed.");
    }
}
Also used: Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
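
Note that the GenericOptionsParser instance above is deliberately discarded: the constructor applies the generic options to the passed-in Configuration as a side effect, so conf carries the -libjars setting afterwards. A minimal sketch of that side effect, assuming a jar exists at the hypothetical path /tmp/extra.jar (GenericOptionsParser validates that -libjars paths are real files):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class LibJarsSideEffect {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The constructor parses the options and writes them into conf;
        // -libjars ends up in the "tmpjars" property.
        new GenericOptionsParser(conf, new String[] { "-libjars", "/tmp/extra.jar" });
        System.out.println("tmpjars = " + conf.get("tmpjars"));
    }
}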

Example 32 with GenericOptionsParser

Use of org.apache.hadoop.util.GenericOptionsParser in project Cloud9 by lintool.

The class FileMerger, method run:

/**
 * TODO: add in hadoop configuration
 */
@Override
public int run(String[] args) throws IOException {
    Options options = new Options();
    options.addOption(HELP_OPTION, false, "print the help message");
    options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg().withDescription("input file or directory").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg().withDescription("output file").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName(INTEGER_INDICATOR).hasArg().withDescription("number of mappers (default to 0 and hence local merge mode, set to positive value to enable cluster merge mode)").create(MAPPER_OPTION));
    options.addOption(OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator().withDescription("assign value for given property").create("D"));
    options.addOption(TEXT_FILE_INPUT_FORMAT, false, "input file in text format");
    options.addOption(DELETE_SOURCE_OPTION, false, "delete sources after merging");
    int mapperTasks = 0;
    boolean deleteSource = DELETE_SOURCE;
    boolean textFileFormat = TEXT_FILE_INPUT;
    String inputPath = "";
    String outputPath = "";
    GenericOptionsParser genericOptionsParser = new GenericOptionsParser(args);
    Configuration configuration = genericOptionsParser.getConfiguration();
    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
        CommandLine line = parser.parse(options, args);
        if (line.hasOption(HELP_OPTION)) {
            formatter.printHelp(FileMerger.class.getName(), options);
            System.exit(0);
        }
        if (line.hasOption(INPUT_OPTION)) {
            inputPath = line.getOptionValue(INPUT_OPTION);
        } else {
            throw new ParseException("Parsing failed due to " + INPUT_OPTION + " not initialized...");
        }
        if (line.hasOption(OUTPUT_OPTION)) {
            outputPath = line.getOptionValue(OUTPUT_OPTION);
        } else {
            throw new ParseException("Parsing failed due to " + OUTPUT_OPTION + " not initialized...");
        }
        if (line.hasOption(MAPPER_OPTION)) {
            mapperTasks = Integer.parseInt(line.getOptionValue(MAPPER_OPTION));
            if (mapperTasks <= 0) {
                sLogger.info("Warning: " + MAPPER_OPTION + " is not positive, merge in local model...");
                mapperTasks = 0;
            }
        }
        if (line.hasOption(DELETE_SOURCE_OPTION)) {
            deleteSource = true;
        }
        if (line.hasOption(TEXT_FILE_INPUT_FORMAT)) {
            textFileFormat = true;
        }
    } catch (ParseException pe) {
        System.err.println(pe.getMessage());
        formatter.printHelp(FileMerger.class.getName(), options);
        System.exit(0);
    } catch (NumberFormatException nfe) {
        System.err.println(nfe.getMessage());
        System.exit(0);
    }
    try {
        merge(configuration, inputPath, outputPath, mapperTasks, textFileFormat, deleteSource);
    } catch (InstantiationException ie) {
        ie.printStackTrace();
    } catch (IllegalAccessException iae) {
        iae.printStackTrace();
    }
    return 0;
}
Also used: Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
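
FileMerger hands the full args array to both parsers and registers its own "D" option so that Commons CLI tolerates Hadoop's -D pairs. A minimal sketch of the alternative, with hypothetical option names: parse the generic options first and feed only getRemainingArgs() to the application parser, so the two option sets never collide.

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class SplitParsing {
    public static void main(String[] args) throws Exception {
        // Hadoop consumes its generic options (-D, -libjars, -files, ...) here.
        GenericOptionsParser genericParser = new GenericOptionsParser(args);
        Configuration conf = genericParser.getConfiguration();
        // Only what Hadoop did not consume reaches the application parser.
        String[] appArgs = genericParser.getRemainingArgs();

        Options options = new Options();
        options.addOption("input", true, "input path (hypothetical option)");
        CommandLine line = new GnuParser().parse(options, appArgs);
        System.out.println("input = " + line.getOptionValue("input"));
        System.out.println("fs.defaultFS = " + conf.get("fs.defaultFS"));
    }
}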

Example 33 with GenericOptionsParser

Use of org.apache.hadoop.util.GenericOptionsParser in project hive by apache.

The class WriteRC, method run:

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    String serverUri = args[0];
    String inputTableName = args[1];
    String outputTableName = args[2];
    String dbName = null;
    String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
    if (principalID != null)
        conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
    Job job = new Job(conf, "WriteRC");
    HCatInputFormat.setInput(job, dbName, inputTableName);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setJarByClass(WriteRC.class);
    job.setMapperClass(Map.class);
    job.setOutputKeyClass(WritableComparable.class);
    job.setOutputValueClass(DefaultHCatRecord.class);
    job.setNumReduceTasks(0);
    // initialize HCatOutputFormat
    HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
    HCatSchema s = HCatInputFormat.getTableSchema(job);
    System.err.println("INFO: output schema explicitly set for writing:" + s);
    HCatOutputFormat.setSchema(job, s);
    job.setOutputFormatClass(HCatOutputFormat.class);
    return (job.waitForCompletion(true) ? 0 : 1);
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
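
WriteRC implements run(String[]) and calls getConf(), which is the Tool pattern; wrapping such a class in ToolRunner makes the explicit GenericOptionsParser call above mostly redundant, because ToolRunner already routes the arguments through it. A minimal sketch of that idiom (class name and output are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class EchoTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // args here is what GenericOptionsParser left over; generic options
        // such as -D key=value have already been applied to getConf().
        for (String a : args) {
            System.out.println("app arg: " + a);
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        // e.g. hadoop jar app.jar EchoTool -D my.key=value input output
        System.exit(ToolRunner.run(new Configuration(), new EchoTool(), args));
    }
}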

Example 34 with GenericOptionsParser

Use of org.apache.hadoop.util.GenericOptionsParser in project hive by apache.

The class WriteTextPartitioned, method run:

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    String serverUri = args[0];
    String inputTableName = args[1];
    String outputTableName = args[2];
    if (args.length > 3)
        filter = args[3];
    String dbName = null;
    String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
    if (principalID != null)
        conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
    Job job = new Job(conf, "WriteTextPartitioned");
    HCatInputFormat.setInput(job, dbName, inputTableName, filter);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setJarByClass(WriteTextPartitioned.class);
    job.setMapperClass(Map.class);
    job.setOutputKeyClass(WritableComparable.class);
    job.setOutputValueClass(DefaultHCatRecord.class);
    job.setNumReduceTasks(0);
    java.util.Map<String, String> partitionVals = null;
    if (filter != null) {
        String[] s = filter.split("=");
        String val = s[1].replace('"', ' ').trim();
        partitionVals = new HashMap<String, String>(1);
        partitionVals.put(s[0], val);
    }
    // initialize HCatOutputFormat
    HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, partitionVals));
    HCatSchema s = HCatInputFormat.getTableSchema(job);
    // Build the schema for the output table, which is slightly different from
    // the schema of the input table
    List<HCatFieldSchema> fss = new ArrayList<HCatFieldSchema>(3);
    fss.add(s.get(0));
    fss.add(s.get(1));
    fss.add(s.get(3));
    HCatOutputFormat.setSchema(job, new HCatSchema(fss));
    job.setOutputFormatClass(HCatOutputFormat.class);
    return (job.waitForCompletion(true) ? 0 : 1);
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) ArrayList(java.util.ArrayList) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema)
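
The filter-to-partition parsing above assumes a single key="value" clause. A worked sketch of what that code does, using a hypothetical filter string:

public class FilterParseDemo {
    public static void main(String[] args) {
        String filter = "ds=\"20110924\"";           // hypothetical filter clause
        String[] s = filter.split("=");              // { "ds", "\"20110924\"" }
        String val = s[1].replace('"', ' ').trim();  // quotes blanked out, then trimmed
        System.out.println(s[0] + " -> " + val);     // prints: ds -> 20110924
    }
}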

Example 35 with GenericOptionsParser

Use of org.apache.hadoop.util.GenericOptionsParser in project hive by apache.

The class StoreNumbers, method main:

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    String[] otherArgs = new String[2];
    int j = 0;
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-libjars")) {
            // generic options parser doesn't seem to work!
            conf.set("tmpjars", args[i + 1]);
            // skip the option's value; the loop increment then moves to the next argument
            i = i + 1;
        } else {
            otherArgs[j++] = args[i];
        }
    }
    // otherArgs has a fixed length of 2, so check how many arguments were actually collected
    if (j != 2) {
        usage();
    }
    String serverUri = otherArgs[0];
    if (otherArgs[1] == null || (!otherArgs[1].equalsIgnoreCase("part") && !otherArgs[1].equalsIgnoreCase("nopart") && !otherArgs[1].equalsIgnoreCase("nopart_pig"))) {
        usage();
    }
    boolean writeToPartitionedTable = (otherArgs[1].equalsIgnoreCase("part"));
    boolean writeToNonPartPigTable = (otherArgs[1].equalsIgnoreCase("nopart_pig"));
    String tableName = NUMBERS_TABLE_NAME;
    String dbName = "default";
    Map<String, String> outputPartitionKvps = new HashMap<String, String>();
    String outputTableName = null;
    conf.set(IS_PIG_NON_PART_TABLE, "false");
    if (writeToPartitionedTable) {
        outputTableName = NUMBERS_PARTITIONED_TABLE_NAME;
        outputPartitionKvps.put("datestamp", "20100101");
    } else {
        if (writeToNonPartPigTable) {
            conf.set(IS_PIG_NON_PART_TABLE, "true");
            outputTableName = NUMBERS_NON_PARTITIONED_PIG_TABLE_NAME;
        } else {
            outputTableName = NUMBERS_NON_PARTITIONED_TABLE_NAME;
        }
        // test with null or empty randomly
        if (new Random().nextInt(2) == 0) {
            outputPartitionKvps = null;
        }
    }
    String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
    if (principalID != null)
        conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
    Job job = new Job(conf, "storenumbers");
    // initialize HCatInputFormat
    HCatInputFormat.setInput(job, dbName, tableName);
    // initialize HCatOutputFormat
    HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, outputPartitionKvps));
    // fetch the input table schema to use (possibly adjusted below) for the output
    HCatSchema s = HCatInputFormat.getTableSchema(job);
    if (writeToNonPartPigTable) {
        List<HCatFieldSchema> newHfsList = new ArrayList<HCatFieldSchema>();
        // change smallint and tinyint to int
        for (HCatFieldSchema hfs : s.getFields()) {
            if (hfs.getTypeString().equals("smallint")) {
                newHfsList.add(new HCatFieldSchema(hfs.getName(), HCatFieldSchema.Type.INT, hfs.getComment()));
            } else if (hfs.getTypeString().equals("tinyint")) {
                newHfsList.add(new HCatFieldSchema(hfs.getName(), HCatFieldSchema.Type.INT, hfs.getComment()));
            } else {
                newHfsList.add(hfs);
            }
        }
        s = new HCatSchema(newHfsList);
    }
    HCatOutputFormat.setSchema(job, s);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setOutputFormatClass(HCatOutputFormat.class);
    job.setJarByClass(StoreNumbers.class);
    job.setMapperClass(SumMapper.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setNumReduceTasks(0);
    job.setOutputValueClass(DefaultHCatRecord.class);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) Random(java.util.Random) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
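
The hand-rolled -libjars loop above (and the "doesn't seem to work" comment) most likely stems from argument ordering: GenericOptionsParser stops at the first non-option argument, so generic options are only recognized when they precede the application arguments. A minimal sketch of that ordering rule, assuming the hypothetical jar path exists (otherwise -libjars validation fails):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class LibJarsOrdering {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Recognized: the generic option precedes the app arguments.
        String[] good = { "-libjars", "/tmp/extra.jar", "thrift://host:9083", "part" };
        String[] rest = new GenericOptionsParser(conf, good).getRemainingArgs();
        // rest is { "thrift://host:9083", "part" }; conf now carries tmpjars.
        System.out.println(String.join(" ", rest));

        // Not recognized: the option appears after the app arguments, so it is
        // passed through untouched in getRemainingArgs().
        String[] bad = { "thrift://host:9083", "part", "-libjars", "/tmp/extra.jar" };
        rest = new GenericOptionsParser(new Configuration(), bad).getRemainingArgs();
        System.out.println(String.join(" ", rest));
    }
}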

Aggregations

GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser): 102 uses
Configuration (org.apache.hadoop.conf.Configuration): 72 uses
Path (org.apache.hadoop.fs.Path): 38 uses
Job (org.apache.hadoop.mapreduce.Job): 35 uses
CommandLine (org.apache.commons.cli.CommandLine): 18 uses
IOException (java.io.IOException): 15 uses
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 11 uses
PosixParser (org.apache.commons.cli.PosixParser): 10 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 10 uses
HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema): 10 uses
YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 9 uses
ParseException (org.apache.commons.cli.ParseException): 7 uses
Test (org.junit.jupiter.api.Test): 7 uses
ArrayList (java.util.ArrayList): 6 uses
Options (org.apache.commons.cli.Options): 6 uses
JobConf (org.apache.hadoop.mapred.JobConf): 6 uses
File (java.io.File): 5 uses
HashMap (java.util.HashMap): 5 uses
YarnUncaughtExceptionHandler (org.apache.hadoop.yarn.YarnUncaughtExceptionHandler): 5 uses
TezConfiguration (org.apache.tez.dag.api.TezConfiguration): 5 uses