use of org.apache.hadoop.mapred.TextInputFormat in project hive by apache.
the class SymlinkTextInputFormat method getSplits.
/**
* Parses all target paths from job input directory which contains symlink
* files, and splits the target data using TextInputFormat.
*/
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  Path[] symlinksDirs = FileInputFormat.getInputPaths(job);
  if (symlinksDirs.length == 0) {
    throw new IOException("No input paths specified in job.");
  }
  // Get all target paths first, because the number of total target paths
  // is used to determine number of splits of each target path.
  List<Path> targetPaths = new ArrayList<Path>();
  List<Path> symlinkPaths = new ArrayList<Path>();
  try {
    getTargetPathsFromSymlinksDirs(job, symlinksDirs, targetPaths, symlinkPaths);
  } catch (Exception e) {
    throw new IOException("Error parsing symlinks from specified job input path.", e);
  }
  if (targetPaths.size() == 0) {
    return new InputSplit[0];
  }
  // The input should be in TextInputFormat.
  TextInputFormat inputFormat = new TextInputFormat();
  JobConf newjob = new JobConf(job);
  newjob.setInputFormat(TextInputFormat.class);
  inputFormat.configure(newjob);
  List<InputSplit> result = new ArrayList<InputSplit>();
  // ceil(numSplits / numPaths), so we can get at least numSplits splits.
  int numPaths = targetPaths.size();
  int numSubSplits = (numSplits + numPaths - 1) / numPaths;
  // For each path, do getSplits().
  for (int i = 0; i < numPaths; ++i) {
    Path targetPath = targetPaths.get(i);
    Path symlinkPath = symlinkPaths.get(i);
    FileInputFormat.setInputPaths(newjob, targetPath);
    InputSplit[] iss = inputFormat.getSplits(newjob, numSubSplits);
    for (InputSplit is : iss) {
      result.add(new SymlinkTextInputSplit(symlinkPath, (FileSplit) is));
    }
  }
  return result.toArray(new InputSplit[result.size()]);
}
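For context, the symlink files that getSplits parses are ordinary text files whose lines are paths to the actual data. Below is a minimal driver sketch that writes such a file and asks the format for splits; all paths and the split count are hypothetical, and imports from org.apache.hadoop.fs and org.apache.hadoop.mapred are omitted as in the snippet above.
// Sketch: write a symlink file whose lines point at real data files, then
// use the symlink directory as the job input. All paths are hypothetical.
JobConf job = new JobConf();
FileSystem fs = FileSystem.get(job);
Path symlinkDir = new Path("/tmp/symlinks");             // hypothetical
Path symlinkFile = new Path(symlinkDir, "targets.txt");  // hypothetical
try (FSDataOutputStream out = fs.create(symlinkFile, true)) {
  out.writeBytes("/warehouse/t1/part-00000\n");          // hypothetical target path
  out.writeBytes("/warehouse/t1/part-00001\n");          // hypothetical target path
}
FileInputFormat.setInputPaths(job, symlinkDir);
InputSplit[] splits = new SymlinkTextInputFormat().getSplits(job, 4);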
use of org.apache.hadoop.mapred.TextInputFormat in project apex-malhar by apache.
the class MapOperatorTest method testNodeProcessingSchema.
public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper) throws IOException {
  CollectorTestSink sortSink = new CollectorTestSink();
  oper.output.setSink(sortSink);
  oper.setMapClass(WordCount.Map.class);
  oper.setCombineClass(WordCount.Reduce.class);
  oper.setDirName(testMeta.testDir);
  oper.setConfigFile(null);
  oper.setInputFormatClass(TextInputFormat.class);
  Configuration conf = new Configuration();
  JobConf jobConf = new JobConf(conf);
  FileInputFormat.setInputPaths(jobConf, new Path(testMeta.testDir));
  TextInputFormat inputFormat = new TextInputFormat();
  inputFormat.configure(jobConf);
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
  keySerializer.open(oper.getOutstream());
  keySerializer.serialize(splits[0]);
  oper.setInputSplitClass(splits[0].getClass());
  keySerializer.close();
  oper.setup(null);
  oper.beginWindow(0);
  oper.emitTuples();
  oper.emitTuples();
  oper.endWindow();
  oper.beginWindow(1);
  oper.emitTuples();
  oper.endWindow();
  Assert.assertEquals("number emitted tuples", 3, sortSink.collectedTuples.size());
  for (Object o : sortSink.collectedTuples) {
    LOG.debug(o.toString());
  }
  LOG.debug("Done testing round\n");
  oper.teardown();
}
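The serializer calls above write the chosen InputSplit into the operator's stream; the operator later has to read it back with the matching Deserializer. A minimal sketch of that read side, assuming the bytes ended up in a byte array (the helper name and the byte[] source are hypothetical):
// Sketch: read back a FileSplit written via SerializationFactory. The byte[]
// parameter is a hypothetical stand-in for whatever stream the test wrote to.
static FileSplit readSplit(byte[] bytes, Configuration conf) throws IOException {
  SerializationFactory factory = new SerializationFactory(conf);
  Deserializer<FileSplit> deserializer = factory.getDeserializer(FileSplit.class);
  deserializer.open(new ByteArrayInputStream(bytes));
  try {
    return deserializer.deserialize(null);  // null lets the deserializer allocate the instance
  } finally {
    deserializer.close();
  }
}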
use of org.apache.hadoop.mapred.TextInputFormat in project hadoop by apache.
the class ValueAggregatorJob method createValueAggregatorJob.
/**
* Create an Aggregate based map/reduce job.
*
* @param args the arguments used for job creation. Generic hadoop
* arguments are accepted.
* @param caller the caller class.
* @return a JobConf object ready for submission.
*
* @throws IOException
* @see GenericOptionsParser
*/
@SuppressWarnings("rawtypes")
public static JobConf createValueAggregatorJob(String[] args, Class<?> caller) throws IOException {
Configuration conf = new Configuration();
GenericOptionsParser genericParser = new GenericOptionsParser(conf, args);
args = genericParser.getRemainingArgs();
if (args.length < 2) {
System.out.println("usage: inputDirs outDir " + "[numOfReducer [textinputformat|seq [specfile [jobName]]]]");
GenericOptionsParser.printGenericCommandUsage(System.out);
System.exit(1);
}
String inputDir = args[0];
String outputDir = args[1];
int numOfReducers = 1;
if (args.length > 2) {
numOfReducers = Integer.parseInt(args[2]);
}
Class<? extends InputFormat> theInputFormat = TextInputFormat.class;
if (args.length > 3 && args[3].compareToIgnoreCase("textinputformat") == 0) {
theInputFormat = TextInputFormat.class;
} else {
theInputFormat = SequenceFileInputFormat.class;
}
Path specFile = null;
if (args.length > 4) {
specFile = new Path(args[4]);
}
String jobName = "";
if (args.length > 5) {
jobName = args[5];
}
JobConf theJob = new JobConf(conf);
if (specFile != null) {
theJob.addResource(specFile);
}
String userJarFile = theJob.get("user.jar.file");
if (userJarFile == null) {
theJob.setJarByClass(caller != null ? caller : ValueAggregatorJob.class);
} else {
theJob.setJar(userJarFile);
}
theJob.setJobName("ValueAggregatorJob: " + jobName);
FileInputFormat.addInputPaths(theJob, inputDir);
theJob.setInputFormat(theInputFormat);
theJob.setMapperClass(ValueAggregatorMapper.class);
FileOutputFormat.setOutputPath(theJob, new Path(outputDir));
theJob.setOutputFormat(TextOutputFormat.class);
theJob.setMapOutputKeyClass(Text.class);
theJob.setMapOutputValueClass(Text.class);
theJob.setOutputKeyClass(Text.class);
theJob.setOutputValueClass(Text.class);
theJob.setReducerClass(ValueAggregatorReducer.class);
theJob.setCombinerClass(ValueAggregatorCombiner.class);
theJob.setNumMapTasks(1);
theJob.setNumReduceTasks(numOfReducers);
return theJob;
}
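A driver that uses this factory method typically just builds the JobConf and hands it to JobClient; a minimal sketch (the driver class name is illustrative):
// Sketch: build the aggregate job from command-line args and submit it.
// JobClient.runJob blocks until the job completes. Class name is illustrative.
public class AggregateDriver {
  public static void main(String[] args) throws IOException {
    JobConf job = ValueAggregatorJob.createValueAggregatorJob(args, AggregateDriver.class);
    JobClient.runJob(job);
  }
}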
use of org.apache.hadoop.mapred.TextInputFormat in project SQLWindowing by hbutani.
the class IOUtils method createTableWindowingInput.
@SuppressWarnings("unchecked")
public static WindowingInput createTableWindowingInput(String dbName, String tableName, Configuration conf) throws WindowingException {
try {
HiveMetaStoreClient client = HiveUtils.getClient(conf);
String db = HiveUtils.validateDB(client, dbName);
Table t = HiveUtils.getTable(client, db, tableName);
StorageDescriptor sd = t.getSd();
HiveConf hConf = new HiveConf(conf, IOUtils.class);
JobConf job = new JobConf(hConf);
Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(sd.getInputFormat());
hConf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
hConf.set(INPUT_INPUTFORMAT_CLASS, inputFormatClass.getName());
InputFormat<? extends Writable, ? extends Writable> iFmt = inputFormatClass.newInstance();
if (iFmt instanceof TextInputFormat) {
((TextInputFormat) iFmt).configure(job);
}
Path p = new Path(sd.getLocation());
/*
* Convert the Path in the StorageDescriptor into a Path in the current FileSystem.
* Used in testing: Jobs run on MiniDFSCluster, whereas hive metadata refers to a real cluster.
*/
{
p = makeQualified(p, conf);
}
FileInputFormat.addInputPath(job, p);
InputSplit[] iSplits = iFmt.getSplits(job, 1);
org.apache.hadoop.mapred.RecordReader<Writable, Writable> rdr = (org.apache.hadoop.mapred.RecordReader<Writable, Writable>) iFmt.getRecordReader(iSplits[0], job, Reporter.NULL);
hConf.set(INPUT_PATH, sd.getLocation());
hConf.set(INPUT_KEY_CLASS, rdr.createKey().getClass().getName());
hConf.set(INPUT_VALUE_CLASS, rdr.createValue().getClass().getName());
hConf.set(INPUT_SERDE_CLASS, sd.getSerdeInfo().getSerializationLib());
TableWindowingInput tIn = new TableWindowingInput();
tIn.initialize(null, hConf, MetaStoreUtils.getSchema(t));
return tIn;
} catch (WindowingException w) {
throw w;
} catch (Exception e) {
throw new WindowingException(e);
}
}
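The instanceof TextInputFormat check above exists only because TextInputFormat must see configure(job) before getSplits. A sketch of a more general alternative is to instantiate the format through Hadoop's ReflectionUtils, which configures JobConfigurable implementations (TextInputFormat included) when handed a JobConf:
// Sketch: replace the explicit instanceof check with ReflectionUtils, which
// calls configure(job) on any JobConfigurable input format it instantiates.
InputFormat<? extends Writable, ? extends Writable> iFmt =
    ReflectionUtils.newInstance(inputFormatClass, job);
This keeps the method independent of which concrete InputFormat the table's StorageDescriptor names.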
use of org.apache.hadoop.mapred.TextInputFormat in project incubator-systemml by apache.
the class ResultMergeLocalFile method mergeTextCellWithoutComp.
private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO) {
  try {
    // delete target file if already exists
    MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
    if (ALLOW_COPY_CELLFILES) {
      copyAllFiles(fnameNew, inMO);
      // we're done
      return;
    }
    // actual merge
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(fnameNew);
    FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
    String valueStr = null;
    try {
      // read/write all inputs
      for (MatrixObject in : inMO) {
        if (LOG.isTraceEnabled())
          LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname=" + in.getFileName() + ") via stream merge");
        JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
        Path tmpPath = new Path(in.getFileName());
        FileInputFormat.addInputPath(tmpJob, tmpPath);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(tmpJob);
        InputSplit[] splits = informat.getSplits(tmpJob, 1);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : splits) {
          RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
          try {
            while (reader.next(key, value)) {
              valueStr = value.toString().trim();
              out.write(valueStr + "\n");
            }
          } finally {
            IOUtilFunctions.closeSilently(reader);
          }
        }
      }
    } finally {
      IOUtilFunctions.closeSilently(out);
    }
  } catch (Exception ex) {
    throw new DMLRuntimeException("Unable to merge text cell results.", ex);
  }
}
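copyAllFiles itself is not shown in this snippet; a hedged sketch of what such a copy-based fast path could look like with FileUtil.copy (the method name and its body are hypothetical, not SystemML's actual implementation):
// Hypothetical sketch of a copy-based fast path: copy each input's file under
// the target directory instead of streaming records line by line.
private static void copyAllFilesSketch(String fnameNew, ArrayList<MatrixObject> inMO) throws IOException {
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  Path target = new Path(fnameNew);
  FileSystem targetFs = IOUtilFunctions.getFileSystem(target, job);
  targetFs.mkdirs(target);
  for (MatrixObject in : inMO) {
    Path src = new Path(in.getFileName());
    FileSystem srcFs = IOUtilFunctions.getFileSystem(src, job);
    // copy without deleting the source; the destination keeps the source file name
    FileUtil.copy(srcFs, src, targetFs, new Path(target, src.getName()), false, job);
  }
}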