Use of org.apache.hadoop.mapred.RecordWriter in project hive by apache.
In class TestInputOutputFormat, method testSplitElimination:
@Test
public void testSplitElimination() throws Exception {
  Properties properties = new Properties();
  properties.setProperty("columns", "z,r");
  properties.setProperty("columns.types", "int:struct<x:int,y:int>");
  StructObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(
        NestedRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  AbstractSerDe serde = new OrcSerde();
  OutputFormat<?, ?> outFormat = new OrcOutputFormat();
  // Force tiny splits so that, without elimination, this file would produce splits.
  conf.setInt("mapred.max.split.size", 50);
  // Write three rows to the test ORC file.
  RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
  writer.write(NullWritable.get(), serde.serialize(new NestedRow(1, 2, 3), inspector));
  writer.write(NullWritable.get(), serde.serialize(new NestedRow(4, 5, 6), inspector));
  writer.write(NullWritable.get(), serde.serialize(new NestedRow(7, 8, 9), inspector));
  writer.close(Reporter.NULL);
  serde = new OrcSerde();
  // Push down a predicate (z < 0) that no stored row can satisfy.
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startAnd()
      .lessThan("z", PredicateLeaf.Type.LONG, new Long(0))
      .end()
      .build();
  conf.set("sarg.pushdown", toKryo(sarg));
  conf.set("hive.io.file.readcolumn.names", "z,r");
  SerDeUtils.initializeSerDe(serde, conf, properties, null);
  inspector = (StructObjectInspector) serde.getObjectInspector();
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  // The predicate rules out every row, so split generation eliminates the file entirely.
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(0, splits.length);
}
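For contrast, a predicate that the stored rows can satisfy leaves the splits intact. The fragment below is a hypothetical continuation of the same test, reusing the conf, fs, testFilePath, and toKryo helpers from the fixture above; it is a sketch, not code from the Hive test suite.

// Hypothetical follow-up: NOT (z < 0) matches the rows written above (z = 1, 4, 7),
// so ORC split generation cannot eliminate the file.
SearchArgument matchingSarg = SearchArgumentFactory.newBuilder()
    .startNot()
    .lessThan("z", PredicateLeaf.Type.LONG, new Long(0))
    .end()
    .build();
conf.set("sarg.pushdown", toKryo(matchingSarg));
InputSplit[] remainingSplits = new OrcInputFormat().getSplits(conf, 1);
// With "mapred.max.split.size" at 50, at least one split should survive.
assertTrue(remainingSplits.length > 0);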
Use of org.apache.hadoop.mapred.RecordWriter in project hive by apache.
In class TestInputOutputFormat, method testSplitEliminationNullStats:
@Test
public void testSplitEliminationNullStats() throws Exception {
  Properties properties = new Properties();
  StructObjectInspector inspector = createSoi();
  AbstractSerDe serde = new OrcSerde();
  OutputFormat<?, ?> outFormat = new OrcOutputFormat();
  conf.setInt("mapred.max.split.size", 50);
  // Write three rows whose only column is null, so the ORC column statistics carry no min/max.
  RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
  writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
  writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
  writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector));
  writer.close(Reporter.NULL);
  serde = new OrcSerde();
  // Push down z < "foo"; with all-null statistics the predicate can never match.
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startAnd()
      .lessThan("z", PredicateLeaf.Type.STRING, new String("foo"))
      .end()
      .build();
  conf.set("sarg.pushdown", toKryo(sarg));
  conf.set("hive.io.file.readcolumn.names", "z");
  properties.setProperty("columns", "z");
  properties.setProperty("columns.types", "string");
  SerDeUtils.initializeSerDe(serde, conf, properties, null);
  inspector = (StructObjectInspector) serde.getObjectInspector();
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  // Split elimination removes the file, so no splits are returned.
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(0, splits.length);
}
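Why elimination still works here: the only column is written as all nulls, so the ORC column statistics carry no minimum or maximum, and the pushed-down z < "foo" predicate can never evaluate to true. Below is a minimal sketch of inspecting those statistics directly, assuming Hive's ORC reader API (OrcFile.createReader, StringColumnStatistics) and the testFilePath/conf from the fixture above; the column indexing is an assumption for illustration.

// Sketch: check the column statistics of the all-null string column.
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf));
ColumnStatistics[] stats = reader.getStatistics();
// Column 0 is the struct root; column 1 is "z".
StringColumnStatistics zStats = (StringColumnStatistics) stats[1];
// With only nulls written there is no min/max, so the predicate cannot match.
assertNull(zStats.getMinimum());
assertNull(zStats.getMaximum());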
Use of org.apache.hadoop.mapred.RecordWriter in project hive by apache.
In class DynamicPartitionFileRecordWriterContainer, method getLocalFileWriter:
@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
  OutputJobInfo localJobInfo = null;
  // Calculate which writer to use from the remaining values - this needs to
  // be done before we delete cols.
  List<String> dynamicPartValues = new ArrayList<String>();
  for (Integer colToAppend : dynamicPartCols) {
    Object partitionValue = value.get(colToAppend);
    dynamicPartValues.add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
  }
  String dynKey = dynamicPartValues.toString();
  if (!baseDynamicWriters.containsKey(dynKey)) {
    if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
      throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
          "Number of dynamic partitions being created "
              + "exceeds configured max allowable partitions[" + maxDynamicPartitions
              + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
              + "] if needed.");
    }
    org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil.createTaskAttemptContext(context);
    configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
    localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());
    // Set up the SerDe.
    AbstractSerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(), currTaskContext.getJobConf());
    try {
      InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
    } catch (SerDeException e) {
      throw new IOException("Failed to initialize SerDe", e);
    }
    // Create the base OutputFormat.
    org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
    // We skip calling checkOutputSpecs() for each partition, since it can throw a
    // FileAlreadyExistsException when more than one mapper is writing to a partition
    // (see HCATALOG-490), and to avoid contacting the namenode for each new
    // FileOutputFormat instance. In general this should be fine for most
    // FileOutputFormat implementations, but it may become an issue when the method
    // is used to perform other setup tasks.
    // Get the output committer.
    org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf().getOutputCommitter();
    // Create currJobContext last so that it picks up all the config changes.
    org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
    // Set up the job.
    baseOutputCommitter.setupJob(currJobContext);
    // Recreate to refresh the jobConf of the current task context.
    currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(),
        currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
    // Set the temp location.
    currTaskContext.getConfiguration().set("mapred.work.output.dir",
        new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath().toString());
    // Set up the task.
    baseOutputCommitter.setupTask(currTaskContext);
    Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
    Path childPath = new Path(parentDir,
        FileOutputFormat.getUniqueFile(currTaskContext,
            currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));
    RecordWriter baseRecordWriter = baseOF.getRecordWriter(
        parentDir.getFileSystem(currTaskContext.getConfiguration()),
        currTaskContext.getJobConf(), childPath.toString(),
        InternalUtil.createReporter(currTaskContext));
    baseDynamicWriters.put(dynKey, baseRecordWriter);
    baseDynamicSerDe.put(dynKey, currSerDe);
    baseDynamicCommitters.put(dynKey, baseOutputCommitter);
    dynamicContexts.put(dynKey, currTaskContext);
    dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
    dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
  }
  return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey),
      baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
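On the write path, this container comes into play when a job is configured for dynamic partitioning through HCatOutputFormat. The following is a minimal client-side sketch, assuming an existing table partitioned by a dt column; the default.web_logs database and table names are made up for illustration, and the calls shown are the mapreduce-side HCatOutputFormat API.

// Sketch: configure an MR job to write through HCatalog with dynamic partitions.
Job job = Job.getInstance(conf, "dynamic-partition-write");
// Passing null partition values requests dynamic partitioning; each record's dt
// value is read at write time, which is what routes it through getLocalFileWriter().
OutputJobInfo outputInfo = OutputJobInfo.create("default", "web_logs", null);
HCatOutputFormat.setOutput(job, outputInfo);
HCatOutputFormat.setSchema(job, HCatOutputFormat.getTableSchema(job.getConfiguration()));
job.setOutputFormatClass(HCatOutputFormat.class);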
Use of org.apache.hadoop.mapred.RecordWriter in project hadoop by apache.
In class MultipleOutputFormat, method getRecordWriter:
/**
* Create a composite record writer that can write key/value data to different
* output files
*
* @param fs
* the file system to use
* @param job
* the job conf for the job
* @param name
* the leaf file name for the output file (such as "part-00000")
* @param arg3
* a progressable for reporting progress.
* @return a composite record writer
* @throws IOException
*/
public RecordWriter<K, V> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable arg3) throws IOException {
  final FileSystem myFS = fs;
  final String myName = generateLeafFileName(name);
  final JobConf myJob = job;
  final Progressable myProgressable = arg3;
  return new RecordWriter<K, V>() {

    // A cache storing the record writers for different output files.
    TreeMap<String, RecordWriter<K, V>> recordWriters = new TreeMap<String, RecordWriter<K, V>>();

    public void write(K key, V value) throws IOException {
      // Get the file name based on the key.
      String keyBasedPath = generateFileNameForKeyValue(key, value, myName);
      // Get the file name based on the input file name.
      String finalPath = getInputFileBasedOutputFileName(myJob, keyBasedPath);
      // Get the actual key and value to write.
      K actualKey = generateActualKey(key, value);
      V actualValue = generateActualValue(key, value);
      RecordWriter<K, V> rw = this.recordWriters.get(finalPath);
      if (rw == null) {
        // If we don't have a record writer yet for the final path,
        // create one and add it to the cache.
        rw = getBaseRecordWriter(myFS, myJob, finalPath, myProgressable);
        this.recordWriters.put(finalPath, rw);
      }
      rw.write(actualKey, actualValue);
    }

    public void close(Reporter reporter) throws IOException {
      // Close every cached writer.
      Iterator<String> keys = this.recordWriters.keySet().iterator();
      while (keys.hasNext()) {
        RecordWriter<K, V> rw = this.recordWriters.get(keys.next());
        rw.close(reporter);
      }
      this.recordWriters.clear();
    }
  };
}
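In practice this composite writer is used by subclassing one of the concrete implementations and overriding the file-name hooks. A brief sketch, assuming the stock MultipleTextOutputFormat from org.apache.hadoop.mapred.lib; the key-to-directory layout and class name are illustrative.

// Sketch: route each record into a subdirectory named after its key.
public class KeyPartitionedTextOutputFormat extends MultipleTextOutputFormat<Text, Text> {

  @Override
  protected String generateFileNameForKeyValue(Text key, Text value, String name) {
    // e.g. key "2024-01-01" and leaf name "part-00000" yield "2024-01-01/part-00000";
    // getRecordWriter() above then caches one base writer per generated path.
    return key.toString() + "/" + name;
  }

  @Override
  protected Text generateActualKey(Text key, Text value) {
    // Drop the key from the file contents, since it is already encoded in the path.
    return null;
  }
}

In the driver, the format is wired in with conf.setOutputFormat(KeyPartitionedTextOutputFormat.class) on the JobConf.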