use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hbase by apache.
the class MultiHFileOutputFormat method createMultiHFileRecordWriter.
/**
 * Creates a record writer that routes each cell to a per-table HFile writer.
 *
 * <p>The key passed to {@code write} is interpreted as a table name. The first time a table
 * name is seen, a sub-directory named after it is created under the task's work path and a
 * dedicated {@code HFileOutputFormat2.HFileRecordWriter} is opened on it; later cells for the
 * same table reuse that writer.
 *
 * @param context the task attempt context used to resolve the output path and configuration
 * @param <V> the cell type being written
 * @return a writer keyed by table name
 * @throws IOException if the work path's file system cannot be obtained
 */
static <V extends Cell> RecordWriter<ImmutableBytesWritable, V> createMultiHFileRecordWriter(final TaskAttemptContext context) throws IOException {
  // Get the path of the output directory
  final Path outputPath = FileOutputFormat.getOutputPath(context);
  final Path outputDir = new FileOutputCommitter(outputPath, context).getWorkPath();
  final Configuration conf = context.getConfiguration();
  final FileSystem fs = outputDir.getFileSystem(conf);
  // Map of tables to writers
  final Map<ImmutableBytesWritable, RecordWriter<ImmutableBytesWritable, V>> tableWriters = new HashMap<>();
  return new RecordWriter<ImmutableBytesWritable, V>() {
    @Override
    public void write(ImmutableBytesWritable tableName, V cell) throws IOException, InterruptedException {
      RecordWriter<ImmutableBytesWritable, V> tableWriter = tableWriters.get(tableName);
      // if there is new table, verify that table directory exists
      if (tableWriter == null) {
        final byte[] tableNameBytes = tableName.copyBytes();
        // using table name as directory name
        final Path tableOutputDir = new Path(outputDir, Bytes.toString(tableNameBytes));
        fs.mkdirs(tableOutputDir);
        LOG.info("Writing Table '" + tableName.toString() + "' data into following directory " + tableOutputDir.toString());
        // Create writer for one specific table
        tableWriter = new HFileOutputFormat2.HFileRecordWriter<>(context, tableOutputDir);
        // Store a private copy of the key: the caller may reuse and mutate the instance it
        // passed in, which would otherwise change the hash of a key already in the map.
        tableWriters.put(new ImmutableBytesWritable(tableNameBytes), tableWriter);
      }
      // Write <Row, Cell> into tableWriter
      // in the original code, it does not use Row
      tableWriter.write(null, cell);
    }

    @Override
    public void close(TaskAttemptContext c) throws IOException, InterruptedException {
      // Attempt to close every writer so one I/O failure does not leave the others open;
      // the first failure is rethrown with later ones attached as suppressed.
      IOException firstFailure = null;
      for (RecordWriter<ImmutableBytesWritable, V> writer : tableWriters.values()) {
        try {
          writer.close(c);
        } catch (IOException e) {
          if (firstFailure == null) {
            firstFailure = e;
          } else {
            firstFailure.addSuppressed(e);
          }
        }
      }
      if (firstFailure != null) {
        throw firstFailure;
      }
    }
  };
}
use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hive by apache.
the class HiveHFileOutputFormat method getHiveRecordWriter.
/**
 * Builds a Hive record writer that emits HFiles for a single column family.
 *
 * <p>Rows arrive either as {@code Text} (delimited fields, first field is the row key) or as
 * {@code PutWritable}. On a non-aborted close, the generated files are moved from the task
 * attempt's work directory into the configured family path, and the work directory is replaced
 * by an empty file.
 *
 * @param jc job configuration
 * @param finalOutPath output path registered on the internally created job
 * @param valueClass unused by this implementation
 * @param isCompressed whether output compression is enabled on the job
 * @param tableProperties table properties; the {@code columns} property supplies column names
 * @param progressable progress callback handed to the task attempt context
 * @return the record writer
 * @throws IOException if the underlying HFile writer cannot be created
 * @throws RuntimeException if no HFile family path is configured
 */
@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, final Progressable progressable) throws IOException {
  // Read configuration for the target path, first from jobconf, then from table properties
  String hfilePath = getFamilyPath(jc, tableProperties);
  if (hfilePath == null) {
    throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
  }
  // Target path's last component is also the column family name.
  final Path columnFamilyPath = new Path(hfilePath);
  final String columnFamilyName = columnFamilyPath.getName();
  final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
  final Job job = new Job(jc);
  setCompressOutput(job, isCompressed);
  setOutputPath(job, finalOutPath);
  // Create the HFile writer
  final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), progressable);
  final Path outputdir = FileOutputFormat.getOutputPath(tac);
  final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
  final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(tac);
  // Individual columns are going to be pivoted to HBase cells,
  // and for each row, they need to be written out in order
  // of column name, so sort the column names now, creating a
  // mapping to their column position. However, the first
  // column is interpreted as the row key.
  String columnList = tableProperties.getProperty("columns");
  String[] columnArray = columnList.split(",");
  final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
  int i = 0;
  for (String columnName : columnArray) {
    if (i != 0) {
      columnMap.put(Bytes.toBytes(columnName), i);
    }
    ++i;
  }
  return new RecordWriter() {
    @Override
    public void close(boolean abort) throws IOException {
      try {
        fileWriter.close(null);
        if (abort) {
          return;
        }
        // Move the hfiles file(s) from the task output directory to the
        // location specified by the user.
        FileSystem fs = outputdir.getFileSystem(jc);
        fs.mkdirs(columnFamilyPath);
        Path srcDir = taskAttemptOutputdir;
        // Descend through single-child directories until the family directory is reached.
        for (; ; ) {
          FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
          if ((files == null) || (files.length == 0)) {
            throw new IOException("No family directories found in " + srcDir);
          }
          if (files.length != 1) {
            throw new IOException("Multiple family directories found in " + srcDir);
          }
          srcDir = files[0].getPath();
          if (srcDir.getName().equals(columnFamilyName)) {
            break;
          }
          if (files[0].isFile()) {
            throw new IOException("No family directories found in " + taskAttemptOutputdir + ". " + "The last component in hfile path should match column family name " + columnFamilyName);
          }
        }
        for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
          Path source = regionFile.getPath();
          Path target = new Path(columnFamilyPath, source.getName());
          // rename() reports failure through its return value; the source tree is deleted
          // right below, so an unnoticed failed move would lose the file.
          if (!fs.rename(source, target)) {
            throw new IOException("Failed to move " + source + " to " + target);
          }
        }
        // Hive actually wants a file as task output (not a directory), so
        // replace the empty directory with an empty file to keep it happy.
        fs.delete(taskAttemptOutputdir, true);
        fs.createNewFile(taskAttemptOutputdir);
      } catch (InterruptedException ex) {
        throw new IOException(ex);
      }
    }

    private void writeText(Text text) throws IOException {
      // Decompose the incoming text row into fields, separated by Ctrl-A (U+0001).
      String s = text.toString();
      String[] fields = s.split("\u0001");
      assert (fields.length <= (columnMap.size() + 1));
      // First field is the row key.
      byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
      // Remaining fields are cells addressed by column name within row.
      for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
        byte[] columnNameBytes = entry.getKey();
        int iColumn = entry.getValue();
        String val;
        if (iColumn >= fields.length) {
          // trailing blank field
          val = "";
        } else {
          val = fields[iColumn];
          if ("\\N".equals(val)) {
            // omit nulls
            continue;
          }
        }
        byte[] valBytes = Bytes.toBytes(val);
        KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
        try {
          fileWriter.write(null, kv);
        } catch (IOException e) {
          LOG.error("Failed while writing row: " + s);
          throw e;
        } catch (InterruptedException ex) {
          throw new IOException(ex);
        }
      }
    }

    private void writePut(PutWritable put) throws IOException {
      ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
      SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
      for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
        // Cells of each family are sorted before being handed to the HFile writer.
        Collections.sort(entry.getValue(), new CellComparator());
        for (Cell c : entry.getValue()) {
          try {
            fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
          } catch (InterruptedException e) {
            throw (InterruptedIOException) new InterruptedIOException().initCause(e);
          }
        }
      }
    }

    @Override
    public void write(Writable w) throws IOException {
      if (w instanceof Text) {
        writeText((Text) w);
      } else if (w instanceof PutWritable) {
        writePut((PutWritable) w);
      } else {
        throw new IOException("Unexpected writable " + w);
      }
    }
  };
}
use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project flink by apache.
the class HadoopOutputFormatBase method open.
/**
 * Creates the temporary output file for the Hadoop RecordWriter.
 *
 * <p>The 1-based task id ({@code taskNumber + 1}) is zero-padded to six digits to build the
 * Hadoop task attempt id, which is then published under both the old ({@code mapred.*}) and
 * new ({@code mapreduce.*}) configuration keys before the output committer and record writer
 * are created.
 *
 * @param taskNumber The number of the parallel instance.
 * @param numTasks The number of parallel tasks.
 * @throws java.io.IOException if the task id has more than six digits or the record writer
 *     cannot be created
 */
@Override
public void open(int taskNumber, int numTasks) throws IOException {
  // enforce sequential open() calls
  synchronized (OPEN_MUTEX) {
    final int taskId = taskNumber + 1;
    if (Integer.toString(taskId).length() > 6) {
      throw new IOException("Task id too large.");
    }
    this.taskNumber = taskId;
    // for hadoop 2.2
    this.configuration.set("mapreduce.output.basename", "tmp");
    // Zero-pad with %06d. The previous "%<n>s" padding produced the invalid format "%0s"
    // for six-digit ids, which are allowed by the length check above.
    TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_" + String.format(java.util.Locale.ROOT, "%06d", taskId) + "_0");
    this.configuration.set("mapred.task.id", taskAttemptID.toString());
    this.configuration.setInt("mapred.task.partition", taskId);
    // for hadoop 2.2
    this.configuration.set("mapreduce.task.attempt.id", taskAttemptID.toString());
    this.configuration.setInt("mapreduce.task.partition", taskId);
    try {
      this.context = HadoopUtils.instantiateTaskAttemptContext(this.configuration, taskAttemptID);
      this.outputCommitter = this.mapreduceOutputFormat.getOutputCommitter(this.context);
      this.outputCommitter.setupJob(HadoopUtils.instantiateJobContext(this.configuration, new JobID()));
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    this.context.getCredentials().addAll(this.credentials);
    Credentials currentUserCreds = getCredentialsFromUGI(UserGroupInformation.getCurrentUser());
    if (currentUserCreds != null) {
      this.context.getCredentials().addAll(currentUserCreds);
    }
    // compatible for hadoop 2.2.0, the temporary output directory is different from hadoop 1.2.1
    if (outputCommitter instanceof FileOutputCommitter) {
      this.configuration.set("mapreduce.task.output.dir", ((FileOutputCommitter) this.outputCommitter).getWorkPath().toString());
    }
    try {
      this.recordWriter = this.mapreduceOutputFormat.getRecordWriter(this.context);
    } catch (InterruptedException e) {
      throw new IOException("Could not create RecordWriter.", e);
    }
  }
}
use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hive by apache.
the class RCFileMapReduceOutputFormat method getRecordWriter.
/**
 * Opens an {@code RCFile.Writer} on the task's default work file and wraps it in a record
 * writer. Keys passed to the returned writer are ignored; only values are appended.
 *
 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#getRecordWriter(org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public org.apache.hadoop.mapreduce.RecordWriter<WritableComparable<?>, BytesRefArrayWritable> getRecordWriter(TaskAttemptContext task) throws IOException, InterruptedException {
  // FileOutputFormat.getWorkOutputPath needs a TaskInputOutputContext rather than a
  // TaskAttemptContext, so the work path is taken from the committer instead.
  final Path workDir = ((FileOutputCommitter) getOutputCommitter(task)).getWorkPath();
  final FileSystem fileSystem = workDir.getFileSystem(task.getConfiguration());
  final boolean workDirMissing = !fileSystem.exists(workDir);
  if (workDirMissing) {
    fileSystem.mkdirs(workDir);
  }

  final Path targetFile = getDefaultWorkFile(task, "");

  // A codec is only instantiated when output compression is switched on.
  final CompressionCodec codec;
  if (!getCompressOutput(task)) {
    codec = null;
  } else {
    final Class<?> codecType = getOutputCompressorClass(task, DefaultCodec.class);
    codec = (CompressionCodec) ReflectionUtils.newInstance(codecType, task.getConfiguration());
  }

  final RCFile.Writer rcWriter = new RCFile.Writer(fileSystem, task.getConfiguration(), targetFile, task, codec);

  return new RecordWriter<WritableComparable<?>, BytesRefArrayWritable>() {
    /** Appends the value to the RCFile; the key is not used. */
    @Override
    public void write(WritableComparable<?> key, BytesRefArrayWritable value) throws IOException {
      rcWriter.append(value);
    }

    /** Closes the underlying RCFile writer. */
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
      rcWriter.close();
    }
  };
}
use of org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter in project hive by apache.
the class DynamicPartitionFileRecordWriterContainer method getLocalFileWriter.
/**
 * Returns the writer bundle for the dynamic partition that {@code value} belongs to.
 *
 * <p>The partition key is the string form of the list of values found at the
 * {@code dynamicPartCols} positions (nulls are replaced by
 * {@code HIVE_DEFAULT_PARTITION_VALUE}). The first time a key is seen, a task context, SerDe,
 * output format, committer and record writer are set up for it and cached in the
 * per-key maps; later records with the same key reuse the cached entries.
 *
 * @param value the record whose partition column values select the writer
 * @return a {@code LocalFileWriter} built from the cached writer, object inspector, SerDe and
 *     job info for the record's partition key
 * @throws IOException if the SerDe cannot be initialized (wrapping the SerDeException)
 * @throws HCatException if the dynamic partition limit check fails
 */
@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
OutputJobInfo localJobInfo = null;
// Calculate which writer to use from the remaining values - this needs to
// be done before we delete cols.
List<String> dynamicPartValues = new ArrayList<String>();
for (Integer colToAppend : dynamicPartCols) {
Object partitionValue = value.get(colToAppend);
dynamicPartValues.add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
}
// The list's toString() form serves as the cache key for all per-partition maps below.
String dynKey = dynamicPartValues.toString();
if (!baseDynamicWriters.containsKey(dynKey)) {
// A limit of -1 disables the check.
// NOTE(review): with '>' the check only fires once size is already above the limit, so the
// number of writers can reach maxDynamicPartitions + 1 - confirm whether '>=' was intended.
if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS, "Number of dynamic partitions being created " + "exceeds configured max allowable partitions[" + maxDynamicPartitions + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + "] if needed.");
}
org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil.createTaskAttemptContext(context);
configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());
// Setup serDe.
AbstractSerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(), currTaskContext.getJobConf());
try {
InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
} catch (SerDeException e) {
throw new IOException("Failed to initialize SerDe", e);
}
// create base OutputFormat
org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
// We are skipping calling checkOutputSpecs() for each partition
// As it can throw a FileAlreadyExistsException when more than one
// mapper is writing to a partition.
// See HCATALOG-490, also to avoid contacting the namenode for each new
// FileOutputFormat instance.
// In general this should be ok for most FileOutputFormat implementations
// but may become an issue for cases when the method is used to perform
// other setup tasks.
// Get Output Committer
org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf().getOutputCommitter();
// Create currJobContext the latest so it gets all the config changes
org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
// Set up job.
baseOutputCommitter.setupJob(currJobContext);
// Recreate to refresh jobConf of currTask context.
currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(), currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
// Set temp location.
currTaskContext.getConfiguration().set("mapred.work.output.dir", new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath().toString());
// Set up task.
baseOutputCommitter.setupTask(currTaskContext);
// The output file lives under the work dir set above; its name comes from
// FileOutputFormat.getUniqueFile with the configured basename (default "part").
Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));
RecordWriter baseRecordWriter = baseOF.getRecordWriter(parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(), childPath.toString(), InternalUtil.createReporter(currTaskContext));
// Cache everything created for this partition under dynKey.
baseDynamicWriters.put(dynKey, baseRecordWriter);
baseDynamicSerDe.put(dynKey, currSerDe);
baseDynamicCommitters.put(dynKey, baseOutputCommitter);
dynamicContexts.put(dynKey, currTaskContext);
dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
}
return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey), baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
Aggregations