Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
The class Submitter, method run.
@Override
public int run(String[] args) throws Exception {
  CommandLineParser cli = new CommandLineParser();
  if (args.length == 0) {
    cli.printUsage();
    return 1;
  }
  cli.addOption("input", false, "input path to the maps", "path");
  cli.addOption("output", false, "output path from the reduces", "path");
  cli.addOption("jar", false, "job jar file", "path");
  cli.addOption("inputformat", false, "java classname of InputFormat", "class");
  //cli.addArgument("javareader", false, "is the RecordReader in Java");
  cli.addOption("map", false, "java classname of Mapper", "class");
  cli.addOption("partitioner", false, "java classname of Partitioner", "class");
  cli.addOption("reduce", false, "java classname of Reducer", "class");
  cli.addOption("writer", false, "java classname of OutputFormat", "class");
  cli.addOption("program", false, "URI to application executable", "class");
  cli.addOption("reduces", false, "number of reduces", "num");
  cli.addOption("jobconf", false, "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
  cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
  Parser parser = cli.createParser();
  try {
    GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
    CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());
    JobConf job = new JobConf(getConf());
    if (results.hasOption("input")) {
      FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
    }
    if (results.hasOption("output")) {
      FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
    }
    if (results.hasOption("jar")) {
      job.setJar(results.getOptionValue("jar"));
    }
    if (results.hasOption("inputformat")) {
      setIsJavaRecordReader(job, true);
      job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
    }
    if (results.hasOption("javareader")) {
      setIsJavaRecordReader(job, true);
    }
    if (results.hasOption("map")) {
      setIsJavaMapper(job, true);
      job.setMapperClass(getClass(results, "map", job, Mapper.class));
    }
    if (results.hasOption("partitioner")) {
      job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
    }
    if (results.hasOption("reduce")) {
      setIsJavaReducer(job, true);
      job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
    }
    if (results.hasOption("reduces")) {
      job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
    }
    if (results.hasOption("writer")) {
      setIsJavaRecordWriter(job, true);
      job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
    }
    if (results.hasOption("lazyOutput")) {
      if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
        LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
      }
    }
    if (results.hasOption("program")) {
      setExecutable(job, results.getOptionValue("program"));
    }
    if (results.hasOption("jobconf")) {
      LOG.warn("-jobconf option is deprecated, please use -D instead.");
      String options = results.getOptionValue("jobconf");
      StringTokenizer tokenizer = new StringTokenizer(options, ",");
      while (tokenizer.hasMoreTokens()) {
        String keyVal = tokenizer.nextToken().trim();
        String[] keyValSplit = keyVal.split("=");
        job.set(keyValSplit[0], keyValSplit[1]);
      }
    }
    // if they gave us a jar file, include it into the class path
    String jarFile = job.getJar();
    if (jarFile != null) {
      final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
      //FindBugs complains that creating a URLClassLoader should be
      //in a doPrivileged() block.
      ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
        public ClassLoader run() {
          return new URLClassLoader(urls);
        }
      });
      job.setClassLoader(loader);
    }
    runJob(job);
    return 0;
  } catch (ParseException pe) {
    LOG.info("Error : " + pe);
    cli.printUsage();
    return 1;
  }
}
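For context, a minimal sketch of how this run() method is typically reached: Submitter is a Tool, so it can be driven through ToolRunner with the options parsed above. The paths, executable URI, and the properties passed via -jobconf below are illustrative placeholders, not values taken from the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.pipes.Submitter;
import org.apache.hadoop.util.ToolRunner;

public class PipesSubmitExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical input/output paths and executable URI; -jobconf exercises the
    // deprecation warning and the "n1=v1,n2=v2" parsing shown in run() above.
    String[] pipesArgs = {
        "-input", "/user/demo/in",
        "-output", "/user/demo/out",
        "-program", "hdfs:///apps/wordcount-pipes",
        "-reduces", "2",
        "-jobconf", "mapreduce.job.name=pipes-demo,mapreduce.task.timeout=600000"
    };
    int exitCode = ToolRunner.run(new Configuration(), new Submitter(), pipesArgs);
    System.exit(exitCode);
  }
}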
Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
The class MultipleInputs, method getInputFormatMap.
/**
 * Retrieves a map of {@link Path}s to the {@link InputFormat} class
 * that should be used for them.
 *
 * @param conf The configuration of the job
 * @see #addInputPath(JobConf, Path, Class)
 * @return A map of paths to inputformats for the job
 */
static Map<Path, InputFormat> getInputFormatMap(JobConf conf) {
  Map<Path, InputFormat> m = new HashMap<Path, InputFormat>();
  String[] pathMappings = conf.get("mapreduce.input.multipleinputs.dir.formats").split(",");
  for (String pathMapping : pathMappings) {
    String[] split = pathMapping.split(";");
    InputFormat inputFormat;
    try {
      inputFormat = (InputFormat) ReflectionUtils.newInstance(conf.getClassByName(split[1]), conf);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    m.put(new Path(split[0]), inputFormat);
  }
  return m;
}
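As a rough companion to the parsing above, a sketch of how the property it reads gets populated. Judging from the split logic in getInputFormatMap(), each entry has the form path;InputFormatClassName, with entries joined by commas; the paths and formats here are purely illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class InputFormatMapExample {
  public static void main(String[] args) {
    JobConf conf = new JobConf();
    // Register two illustrative paths, each with its own input format.
    MultipleInputs.addInputPath(conf, new Path("/logs/text"), TextInputFormat.class);
    MultipleInputs.addInputPath(conf, new Path("/logs/seq"), SequenceFileInputFormat.class);
    // Given the parsing in getInputFormatMap(), the stored value should look
    // roughly like:
    //   /logs/text;org.apache.hadoop.mapred.TextInputFormat,/logs/seq;org.apache.hadoop.mapred.SequenceFileInputFormat
    System.out.println(conf.get("mapreduce.input.multipleinputs.dir.formats"));
  }
}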
Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
The class TestMultipleInputs, method testAddInputPathWithMapper.
@Test
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class, MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs.getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs.getMapperTypeMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")).getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
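A minimal driver sketch built on the same addInputPath calls the test exercises, showing where they sit in a complete job setup. The input/output paths are hypothetical and the two mappers are trivial stand-ins, not classes from the test above.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsDriver {

  // Mapper for plain text input: key is a byte offset, value is the line.
  public static class TextSourceMapper extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(new Text("text"), value);
    }
  }

  // Mapper for key/value text input: key and value are both Text.
  public static class KeyValueSourceMapper extends MapReduceBase
      implements Mapper<Text, Text, Text, Text> {
    public void map(Text key, Text value,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(key, value);
    }
  }

  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(MultipleInputsDriver.class);
    // Hypothetical input directories, each read with its own format and mapper.
    MultipleInputs.addInputPath(conf, new Path("/data/foo"), TextInputFormat.class,
        TextSourceMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/data/bar"), KeyValueTextInputFormat.class,
        KeyValueSourceMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
    JobClient.runJob(conf);
  }
}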
Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class TestOrcRawRecordMerger, method testRecordReaderIncompleteDelta.
/**
 * @param use130Format true means use delta_0001_0001_0000 format, else delta_0001_00001
 */
private void testRecordReaderIncompleteDelta(boolean use130Format) throws Exception {
  final int BUCKET = 1;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf).getRaw();
  Path root = new Path(tmpDir, "testRecordReaderIncompleteDelta").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  // write a base
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
      .writingBase(true).minimumTransactionId(0).maximumTransactionId(0)
      .bucket(BUCKET).inspector(inspector).filesystem(fs).finalDestination(root);
  if (!use130Format) {
    options.statementId(-1);
  }
  RecordUpdater ru = of.getRecordUpdater(root, options);
  String[] values = new String[] { "1", "2", "3", "4", "5" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(0, new MyRow(values[i]));
  }
  ru.close(false);
  // write a delta
  options.writingBase(false).minimumTransactionId(10).maximumTransactionId(19);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { "6", "7", "8" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(1, new MyRow(values[i]));
  }
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", root.toString());
  job.set("bucket_count", "2");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
  HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  // read the keys before the delta is flushed
  InputSplit[] splits = inf.getSplits(job, 1);
  assertEquals(2, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
      inf.getRecordReader(splits[0], job, Reporter.NULL);
  NullWritable key = rr.createKey();
  OrcStruct value = rr.createValue();
  System.out.println("Looking at split " + splits[0]);
  for (int i = 1; i < 6; ++i) {
    System.out.println("Checking row " + i);
    assertEquals(true, rr.next(key, value));
    assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
  }
  assertEquals(false, rr.next(key, value));
  ru.flush();
  ru.flush();
  values = new String[] { "9", "10" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(3, new MyRow(values[i]));
  }
  ru.flush();
  splits = inf.getSplits(job, 1);
  assertEquals(2, splits.length);
  rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
  Path sideFile = new Path(root + "/" +
      (use130Format ? AcidUtils.deltaSubdir(10, 19, 0) : AcidUtils.deltaSubdir(10, 19)) +
      "/bucket_00001_flush_length");
  assertEquals(true, fs.exists(sideFile));
  assertEquals(24, fs.getFileStatus(sideFile).getLen());
  for (int i = 1; i < 11; ++i) {
    assertEquals(true, rr.next(key, value));
    assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
  }
  assertEquals(false, rr.next(key, value));
}
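Distilled from the test above, a sketch of just the reader-side setup: the JobConf properties the test gives OrcInputFormat before calling getSplits() and getRecordReader() on an ACID table directory. The directory path and the column name/type lists are placeholders, and raw types are used as in the test itself.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcAcidReadSketch {
  @SuppressWarnings({"rawtypes", "unchecked"})
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    job.set("mapred.input.dir", "/tmp/orc-acid-table");            // placeholder table root
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "mytext");       // placeholder column names
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "string"); // placeholder column types
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    // Each value returned by the reader is an ORC row struct whose columns are
    // accessed with getFieldValue(i), as the assertions in the test show.
    InputFormat inf = new OrcInputFormat();
    for (InputSplit split : inf.getSplits(job, 1)) {
      RecordReader rr = inf.getRecordReader(split, job, Reporter.NULL);
      Object key = rr.createKey();
      Object value = rr.createValue();
      while (rr.next(key, value)) {
        System.out.println(value);
      }
      rr.close();
    }
  }
}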
Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class TestOrcRawRecordMerger, method testRecordReaderNewBaseAndDelta.
/**
* Test the RecordReader when there is a new base and a delta.
* @throws Exception
*/
@Test
public void testRecordReaderNewBaseAndDelta() throws Exception {
  final int BUCKET = 11;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf);
  Path root = new Path(tmpDir, "testRecordReaderNewBaseAndDelta").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  // write the base
  MemoryManager mgr = new MemoryManager(conf) {

    int rowsAddedSinceCheck = 0;

    @Override
    public synchronized void addedRow(int rows) throws IOException {
      rowsAddedSinceCheck += rows;
      if (rowsAddedSinceCheck >= 2) {
        notifyWriters();
        rowsAddedSinceCheck = 0;
      }
    }
  };
  // make 5 stripes with 2 rows each
  OrcRecordUpdater.OrcOptions options = (OrcRecordUpdater.OrcOptions) new OrcRecordUpdater.OrcOptions(conf)
      .writingBase(true).minimumTransactionId(0).maximumTransactionId(0)
      .bucket(BUCKET).inspector(inspector).filesystem(fs);
  options.orcOptions(OrcFile.writerOptions(conf).stripeSize(1).blockPadding(false)
      .compress(CompressionKind.NONE).memory(mgr).batchSize(2));
  options.finalDestination(root);
  RecordUpdater ru = of.getRecordUpdater(root, options);
  String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1", "3.0", "ignore.4", "ignore.5", "ignore.6" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(0, new BigRow(i, i, values[i], i, i));
  }
  ru.close(false);
  // write a delta
  options.writingBase(false).minimumTransactionId(1).maximumTransactionId(1).recordIdColumn(5);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
  for (int i = 0; i < values.length; ++i) {
    if (values[i] != null) {
      ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
    }
  }
  ru.delete(100, new BigRow(9, 0, BUCKET));
  ru.close(false);
  // write a delta
  options.minimumTransactionId(2).maximumTransactionId(2);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
  for (int i = 0; i < values.length; ++i) {
    if (values[i] != null) {
      ru.update(2, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
    }
  }
  ru.delete(100, new BigRow(8, 0, BUCKET));
  ru.close(false);
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.min.split.size", "1");
  job.set("mapred.max.split.size", "2");
  job.set("mapred.input.dir", root.toString());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  InputSplit[] splits = inf.getSplits(job, 5);
  assertEquals(5, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
  // loop through the first 4 splits and read each; the last split is checked below
  for (int i = 0; i < 4; ++i) {
    System.out.println("starting split " + i + " = " + splits[i]);
    rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    // there should be exactly two rows per split
    for (int j = 0; j < 2; ++j) {
      System.out.println("i = " + i + ", j = " + j);
      assertEquals(true, rr.next(key, value));
      System.out.println("record = " + value);
      assertEquals(i + "." + j, value.getFieldValue(2).toString());
    }
    assertEquals(false, rr.next(key, value));
  }
  rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
  assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
}