Use of org.apache.hadoop.util.GenericOptionsParser in project shifu by ShifuML.
In the class PostTrainModelProcessor, method runMRFeatureImportanceJob:
private void runMRFeatureImportanceJob(SourceType source, String output) throws IOException, InterruptedException, ClassNotFoundException {
final Configuration conf = new Configuration();
// add jars to hadoop mapper and reducer
new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });
conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
conf.set(Constants.SHIFU_STATS_EXLCUDE_MISSING, Environment.getProperty(Constants.SHIFU_STATS_EXLCUDE_MISSING, "true"));
conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
// set mapreduce.job.max.split.locations to a large value to suppress warnings
conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);
conf.set("mapred.reduce.slowstart.completed.maps", Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.8"));
String hdpVersion = HDPUtils.getHdpVersionForHDP224();
if (StringUtils.isNotBlank(hdpVersion)) {
    // for HDP 2.2.4, hdp.version should be set and the configuration files should be added to the container classpath
    conf.set("hdp.version", hdpVersion);
    HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
    HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
    HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
    HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
}
// one can set guagua conf in shifuconfig
CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
    @Override
    public void inject(Object key, Object value) {
        conf.set(key.toString(), value.toString());
    }
});
@SuppressWarnings("deprecation") Job job = new Job(conf, "Shifu: Post Train FeatureImportance : " + this.modelConfig.getModelSetName());
job.setJarByClass(getClass());
job.setMapperClass(FeatureImportanceMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setInputFormatClass(CombineInputFormat.class);
FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source).makeQualified(new Path(super.modelConfig.getDataSetRawPath())));
job.setReducerClass(FeatureImportanceReducer.class);
job.setNumReduceTasks(1);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(DoubleWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path(output));
// clean output firstly
ShifuFileUtils.deleteFile(output, source);
// submit job
if (!job.waitForCompletion(true)) {
    throw new RuntimeException("Post train Feature Importance MapReduce job is failed.");
}
}
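The parser in this example is used purely for its side effect: passing "-libjars" through GenericOptionsParser registers the jar list on the Configuration (as the tmpjars property) so the dependencies ship with the submitted job. Below is a minimal standalone sketch of just that pattern; the class name and the temporary jar file stand in for Shifu's addRuntimeJars() and are illustrative only.

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class LibJarsSideEffectDemo {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Stand-in for Shifu's addRuntimeJars(): the parser validates that the
        // jars exist, so create a throwaway file to keep the sketch runnable.
        File fakeJar = File.createTempFile("runtime-dep", ".jar");
        fakeJar.deleteOnExit();
        // Invoked only for its side effect: -libjars is translated into the
        // "tmpjars" property on conf, which makes the jar ship with the job.
        new GenericOptionsParser(conf, new String[] { "-libjars", fakeJar.getAbsolutePath() });
        System.out.println("tmpjars = " + conf.get("tmpjars"));
    }
}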
Use of org.apache.hadoop.util.GenericOptionsParser in project Cloud9 by lintool.
In the class FileMerger, method run:
/**
 * TODO: add in hadoop configuration
 */
@Override
public int run(String[] args) throws IOException {
Options options = new Options();
options.addOption(HELP_OPTION, false, "print the help message");
options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg().withDescription("input file or directory").create(INPUT_OPTION));
options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg().withDescription("output file").create(OUTPUT_OPTION));
options.addOption(OptionBuilder.withArgName(INTEGER_INDICATOR).hasArg().withDescription("number of mappers (default to 0 and hence local merge mode, set to positive value to enable cluster merge mode)").create(MAPPER_OPTION));
options.addOption(OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator().withDescription("assign value for given property").create("D"));
options.addOption(TEXT_FILE_INPUT_FORMAT, false, "input file in sequence format");
options.addOption(DELETE_SOURCE_OPTION, false, "delete sources after merging");
int mapperTasks = 0;
boolean deleteSource = DELETE_SOURCE;
boolean textFileFormat = TEXT_FILE_INPUT;
String inputPath = "";
String outputPath = "";
GenericOptionsParser genericOptionsParser = new GenericOptionsParser(args);
Configuration configuration = genericOptionsParser.getConfiguration();
CommandLineParser parser = new GnuParser();
HelpFormatter formatter = new HelpFormatter();
try {
    CommandLine line = parser.parse(options, args);
    if (line.hasOption(HELP_OPTION)) {
        formatter.printHelp(FileMerger.class.getName(), options);
        System.exit(0);
    }
    if (line.hasOption(INPUT_OPTION)) {
        inputPath = line.getOptionValue(INPUT_OPTION);
    } else {
        throw new ParseException("Parsing failed due to " + INPUT_OPTION + " not initialized...");
    }
    if (line.hasOption(OUTPUT_OPTION)) {
        outputPath = line.getOptionValue(OUTPUT_OPTION);
    } else {
        throw new ParseException("Parsing failed due to " + OUTPUT_OPTION + " not initialized...");
    }
    if (line.hasOption(MAPPER_OPTION)) {
        mapperTasks = Integer.parseInt(line.getOptionValue(MAPPER_OPTION));
        if (mapperTasks <= 0) {
            sLogger.info("Warning: " + MAPPER_OPTION + " is not positive, merge in local model...");
            mapperTasks = 0;
        }
    }
    if (line.hasOption(DELETE_SOURCE_OPTION)) {
        deleteSource = true;
    }
    if (line.hasOption(TEXT_FILE_INPUT_FORMAT)) {
        textFileFormat = true;
    }
} catch (ParseException pe) {
    System.err.println(pe.getMessage());
    formatter.printHelp(FileMerger.class.getName(), options);
    System.exit(0);
} catch (NumberFormatException nfe) {
    System.err.println(nfe.getMessage());
    System.exit(0);
}
try {
    merge(configuration, inputPath, outputPath, mapperTasks, textFileFormat, deleteSource);
} catch (InstantiationException ie) {
    ie.printStackTrace();
} catch (IllegalAccessException iae) {
    iae.printStackTrace();
}
return 0;
}
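The detail to notice here is the two-stage parsing: GenericOptionsParser consumes Hadoop's generic flags (-D, -fs, -jt, -libjars, and so on) and exposes the resulting Configuration via getConfiguration(), while commons-cli handles the application's own options. A minimal sketch of that split follows, assuming a plain main() driver; the class name and the hypothetical demo.custom.flag property are illustrative, and the leftover arguments are simply printed rather than parsed with commons-cli.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class TwoStageParsingDemo {
    public static void main(String[] args) throws IOException {
        // Stage 1: Hadoop consumes its generic flags (-D key=value, -fs, -libjars, ...).
        GenericOptionsParser genericParser = new GenericOptionsParser(args);
        Configuration conf = genericParser.getConfiguration();
        // Stage 2: whatever Hadoop did not consume is left for the application parser.
        String[] appArgs = genericParser.getRemainingArgs();
        for (String arg : appArgs) {
            System.out.println("application arg: " + arg);
        }
        // A hypothetical property: any -D demo.custom.flag=... on the command line shows up here.
        System.out.println("demo.custom.flag = " + conf.get("demo.custom.flag"));
    }
}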
Use of org.apache.hadoop.util.GenericOptionsParser in project hive by apache.
In the class WriteRC, method run:
public int run(String[] args) throws Exception {
Configuration conf = getConf();
args = new GenericOptionsParser(conf, args).getRemainingArgs();
String serverUri = args[0];
String inputTableName = args[1];
String outputTableName = args[2];
String dbName = null;
String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
if (principalID != null)
    conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
Job job = new Job(conf, "WriteRC");
HCatInputFormat.setInput(job, dbName, inputTableName);
// initialize HCatOutputFormat
job.setInputFormatClass(HCatInputFormat.class);
job.setJarByClass(WriteRC.class);
job.setMapperClass(Map.class);
job.setOutputKeyClass(WritableComparable.class);
job.setOutputValueClass(DefaultHCatRecord.class);
job.setNumReduceTasks(0);
HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
HCatSchema s = HCatInputFormat.getTableSchema(job);
System.err.println("INFO: output schema explicitly set for writing:" + s);
HCatOutputFormat.setSchema(job, s);
job.setOutputFormatClass(HCatOutputFormat.class);
return (job.waitForCompletion(true) ? 0 : 1);
}
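The opening two lines show the canonical driver idiom: apply GenericOptionsParser to the Configuration obtained from getConf(), then keep only getRemainingArgs() as the positional application arguments (here the server URI and table names). A stripped-down sketch of that idiom in a Tool-style driver follows; the class name and the printing of arguments are illustrative, not from the Hive test. Note that when ToolRunner launches the class it has already run GenericOptionsParser once, so repeating the call inside run() simply re-parses the already-stripped arguments and is harmless.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class DriverIdiomDemo extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        // Generic Hadoop flags are applied to conf; only positional
        // application arguments (e.g. table names) remain in args.
        args = new GenericOptionsParser(conf, args).getRemainingArgs();
        for (String arg : args) {
            System.out.println("positional arg: " + arg);
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new DriverIdiomDemo(), args));
    }
}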
Use of org.apache.hadoop.util.GenericOptionsParser in project hive by apache.
In the class WriteTextPartitioned, method run:
public int run(String[] args) throws Exception {
Configuration conf = getConf();
args = new GenericOptionsParser(conf, args).getRemainingArgs();
String serverUri = args[0];
String inputTableName = args[1];
String outputTableName = args[2];
if (args.length > 3)
    filter = args[3];
String dbName = null;
String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
if (principalID != null)
    conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
Job job = new Job(conf, "WriteTextPartitioned");
HCatInputFormat.setInput(job, dbName, inputTableName, filter);
// initialize HCatOutputFormat
job.setInputFormatClass(HCatInputFormat.class);
job.setJarByClass(WriteTextPartitioned.class);
job.setMapperClass(Map.class);
job.setOutputKeyClass(WritableComparable.class);
job.setOutputValueClass(DefaultHCatRecord.class);
job.setNumReduceTasks(0);
java.util.Map<String, String> partitionVals = null;
if (filter != null) {
    String[] s = filter.split("=");
    String val = s[1].replace('"', ' ').trim();
    partitionVals = new HashMap<String, String>(1);
    partitionVals.put(s[0], val);
}
HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, partitionVals));
HCatSchema s = HCatInputFormat.getTableSchema(job);
// Build the schema for this table, which is slightly different from the
// schema for the input table.
List<HCatFieldSchema> fss = new ArrayList<HCatFieldSchema>(3);
fss.add(s.get(0));
fss.add(s.get(1));
fss.add(s.get(3));
HCatOutputFormat.setSchema(job, new HCatSchema(fss));
job.setOutputFormatClass(HCatOutputFormat.class);
return (job.waitForCompletion(true) ? 0 : 1);
}
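One step above that is easy to miss: the partition filter string is re-parsed into the key/value map that OutputJobInfo.create needs, by splitting on '=' and stripping the surrounding quotes. Below is a tiny standalone sketch of just that parsing step, using a hypothetical filter value of the same shape (key="value"):

import java.util.HashMap;
import java.util.Map;

public class FilterToPartitionDemo {
    public static void main(String[] args) {
        // Hypothetical filter of the shape the job expects: key="value"
        String filter = "ds=\"20100101\"";
        Map<String, String> partitionVals = null;
        if (filter != null) {
            String[] s = filter.split("=");
            // Replace the quotes with spaces and trim, as in the job above.
            String val = s[1].replace('"', ' ').trim();
            partitionVals = new HashMap<String, String>(1);
            partitionVals.put(s[0], val);
        }
        System.out.println(partitionVals); // prints {ds=20100101}
    }
}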
Use of org.apache.hadoop.util.GenericOptionsParser in project hive by apache.
In the class StoreNumbers, method main:
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
args = new GenericOptionsParser(conf, args).getRemainingArgs();
String[] otherArgs = new String[2];
int j = 0;
for (int i = 0; i < args.length; i++) {
    if (args[i].equals("-libjars")) {
        // generic options parser doesn't seem to work!
        conf.set("tmpjars", args[i + 1]);
        // skip it, the for loop will skip its value
        i = i + 1;
    } else {
        otherArgs[j++] = args[i];
    }
}
if (otherArgs.length != 2) {
    usage();
}
String serverUri = otherArgs[0];
if (otherArgs[1] == null || (!otherArgs[1].equalsIgnoreCase("part") && !otherArgs[1].equalsIgnoreCase("nopart")) && !otherArgs[1].equalsIgnoreCase("nopart_pig")) {
    usage();
}
boolean writeToPartitionedTable = (otherArgs[1].equalsIgnoreCase("part"));
boolean writeToNonPartPigTable = (otherArgs[1].equalsIgnoreCase("nopart_pig"));
String tableName = NUMBERS_TABLE_NAME;
String dbName = "default";
Map<String, String> outputPartitionKvps = new HashMap<String, String>();
String outputTableName = null;
conf.set(IS_PIG_NON_PART_TABLE, "false");
if (writeToPartitionedTable) {
    outputTableName = NUMBERS_PARTITIONED_TABLE_NAME;
    outputPartitionKvps.put("datestamp", "20100101");
} else {
    if (writeToNonPartPigTable) {
        conf.set(IS_PIG_NON_PART_TABLE, "true");
        outputTableName = NUMBERS_NON_PARTITIONED_PIG_TABLE_NAME;
    } else {
        outputTableName = NUMBERS_NON_PARTITIONED_TABLE_NAME;
    }
    // test with null or empty randomly
    if (new Random().nextInt(2) == 0) {
        outputPartitionKvps = null;
    }
}
String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
if (principalID != null)
    conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
Job job = new Job(conf, "storenumbers");
// initialize HCatInputFormat
HCatInputFormat.setInput(job, dbName, tableName);
// initialize HCatOutputFormat
HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, outputPartitionKvps));
// test with and without specifying schema randomly
HCatSchema s = HCatInputFormat.getTableSchema(job);
if (writeToNonPartPigTable) {
    List<HCatFieldSchema> newHfsList = new ArrayList<HCatFieldSchema>();
    // change smallint and tinyint to int
    for (HCatFieldSchema hfs : s.getFields()) {
        if (hfs.getTypeString().equals("smallint")) {
            newHfsList.add(new HCatFieldSchema(hfs.getName(), HCatFieldSchema.Type.INT, hfs.getComment()));
        } else if (hfs.getTypeString().equals("tinyint")) {
            newHfsList.add(new HCatFieldSchema(hfs.getName(), HCatFieldSchema.Type.INT, hfs.getComment()));
        } else {
            newHfsList.add(hfs);
        }
    }
    s = new HCatSchema(newHfsList);
}
HCatOutputFormat.setSchema(job, s);
job.setInputFormatClass(HCatInputFormat.class);
job.setOutputFormatClass(HCatOutputFormat.class);
job.setJarByClass(StoreNumbers.class);
job.setMapperClass(SumMapper.class);
job.setOutputKeyClass(IntWritable.class);
job.setNumReduceTasks(0);
job.setOutputValueClass(DefaultHCatRecord.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}