Use of org.apache.hadoop.io.WritableComparable in project hive by apache.
The class RCFileMapReduceOutputFormat, method getRecordWriter:
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#getRecordWriter(org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public org.apache.hadoop.mapreduce.RecordWriter<WritableComparable<?>, BytesRefArrayWritable> getRecordWriter(TaskAttemptContext task)
    throws IOException, InterruptedException {
  // FileOutputFormat.getWorkOutputPath takes TaskInputOutputContext instead of
  // TaskAttemptContext, so can't use that here
  FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(task);
  Path outputPath = committer.getWorkPath();
  FileSystem fs = outputPath.getFileSystem(task.getConfiguration());
  if (!fs.exists(outputPath)) {
    fs.mkdirs(outputPath);
  }
  Path file = getDefaultWorkFile(task, "");
  CompressionCodec codec = null;
  if (getCompressOutput(task)) {
    Class<?> codecClass = getOutputCompressorClass(task, DefaultCodec.class);
    codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, task.getConfiguration());
  }
  final RCFile.Writer out = new RCFile.Writer(fs, task.getConfiguration(), file, task, codec);
  return new RecordWriter<WritableComparable<?>, BytesRefArrayWritable>() {

    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.RecordWriter#write(java.lang.Object, java.lang.Object)
     */
    @Override
    public void write(WritableComparable<?> key, BytesRefArrayWritable value) throws IOException {
      out.append(value);
    }

    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.RecordWriter#close(org.apache.hadoop.mapreduce.TaskAttemptContext)
     */
    @Override
    public void close(TaskAttemptContext task) throws IOException, InterruptedException {
      out.close();
    }
  };
}
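The writer returned above ignores the key and appends only the BytesRefArrayWritable value, so any WritableComparable (typically NullWritable) can serve as the output key. Below is a minimal, hypothetical driver sketch showing how such an output format is wired into a new-API job; the job name, output path, and the package locations of the Hive/HCatalog classes are assumptions, not taken from the snippet.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
// assumed package locations from the Hive serde2 and HCatalog modules
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hive.hcatalog.rcfile.RCFileMapReduceOutputFormat;

public class RcFileJobSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "rcfile-write");              // illustrative job name
    job.setOutputFormatClass(RCFileMapReduceOutputFormat.class);  // the format shown above
    job.setOutputKeyClass(NullWritable.class);                    // key is ignored by write()
    job.setOutputValueClass(BytesRefArrayWritable.class);         // one row per value
    FileOutputFormat.setOutputPath(job, new Path(args[0]));       // illustrative output path
    // mapper/reducer setup elided; they must emit (WritableComparable, BytesRefArrayWritable) pairs
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}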
Use of org.apache.hadoop.io.WritableComparable in project nutch by apache.
The class SegmentReader, method getSeqRecords:
private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(dir, getConf());
  ArrayList<Writable> res = new ArrayList<>();
  Class<?> keyClass = readers[0].getKeyClass();
  Class<?> valueClass = readers[0].getValueClass();
  if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
    throw new IOException("Incompatible key (" + keyClass.getName() + ")");
  WritableComparable aKey = (WritableComparable) keyClass.newInstance();
  Writable value = (Writable) valueClass.newInstance();
  for (int i = 0; i < readers.length; i++) {
    while (readers[i].next(aKey, value)) {
      if (aKey.equals(key)) {
        res.add(value);
        value = (Writable) valueClass.newInstance();
      }
    }
    readers[i].close();
  }
  return res;
}
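SegmentReader invokes this helper once per segment subdirectory, passing the requested URL as the Text key and collecting every matching value. The sketch below mirrors that lookup as a standalone method; the segment path and URL are illustrative assumptions, and the value instance is re-created after each hit so the stored reference is not overwritten on the next read.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

static List<Writable> lookup(Configuration conf) throws Exception {
  Path crawlFetch = new Path("segments/20240101000000/crawl_fetch"); // illustrative segment dir
  Text wanted = new Text("https://example.org/");                    // illustrative URL key
  List<Writable> hits = new ArrayList<>();
  for (MapFile.Reader reader : MapFileOutputFormat.getReaders(crawlFetch, conf)) {
    Text key = new Text();
    Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    while (reader.next(key, value)) {
      if (key.equals(wanted)) {
        hits.add(value);
        // allocate a fresh instance so the collected value is not reused by the reader
        value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      }
    }
    reader.close();
  }
  return hits;
}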
Use of org.apache.hadoop.io.WritableComparable in project incubator-systemml by apache.
The class SamplingSortMRInputFormat, method writePartitionFile:
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @return index value
 * @throws IOException if something goes wrong
 * @throws InstantiationException if InstantiationException occurs
 * @throws IllegalAccessException if IllegalAccessException occurs
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
    throws IOException, InstantiationException, IllegalAccessException {
  SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
  Sampler sampler = new Sampler();
  Class<? extends WritableComparable> targetKeyClass;
  targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
  // get input converter information
  int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
  int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
  // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  // take N samples from different parts of the input
  int totalcount = 0;
  for (int i = 0; i < samples; i++) {
    SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    int count = 0;
    WritableComparable key = (WritableComparable) reader.createKey();
    Writable value = (Writable) reader.createValue();
    while (reader.next(key, value) && count < recordsPerSample) {
      Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
      inputConverter.setBlockSize(brlen, bclen);
      inputConverter.convert(key, value);
      while (inputConverter.hasNext()) {
        Pair pair = inputConverter.next();
        if (pair.getKey() instanceof DoubleWritable) {
          sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
        } else if (pair.getValue() instanceof MatrixCell) {
          sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
        } else {
          throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
        }
        count++;
      }
      key = (WritableComparable) reader.createKey();
      value = (Writable) reader.createValue();
    }
    totalcount += count;
  }
  if (totalcount == 0) // empty input files
    sampler.addValue(new DoubleWritable(0));
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  // note: key value always double/null as expected by partitioner
  SequenceFile.Writer writer = null;
  int index0 = -1;
  try {
    writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    int i = 0;
    boolean lessthan0 = true;
    for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
      writer.append(splitValue, nullValue);
      if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
        index0 = i;
        lessthan0 = false;
      }
      i++;
    }
    if (lessthan0)
      index0 = partitions - 1;
  } finally {
    IOUtilFunctions.closeSilently(writer);
  }
  return index0;
}
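The partition file produced above stores the sampled split points as DoubleWritable keys paired with NullWritable values, and the returned index0 marks the first non-negative boundary. A minimal sketch, assuming a hypothetical file location, of reading those boundaries back with the standard SequenceFile reader:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;

static void printSplitPoints(Configuration conf) throws IOException {
  Path partFile = new Path("/tmp/sort_partitions"); // hypothetical location of the partition file
  try (SequenceFile.Reader in = new SequenceFile.Reader(conf, SequenceFile.Reader.file(partFile))) {
    DoubleWritable boundary = new DoubleWritable();
    NullWritable none = NullWritable.get();
    while (in.next(boundary, none)) {
      System.out.println(boundary.get()); // one split boundary per entry (N-1 for N partitions)
    }
  }
}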
Use of org.apache.hadoop.io.WritableComparable in project systemml by apache.
The class SamplingSortMRInputFormat, method writePartitionFile: identical to the incubator-systemml implementation shown above.
Use of org.apache.hadoop.io.WritableComparable in project presto by prestodb.
The class HiveUtil, method createRecordReader:
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, Map<String, String> customSplitInfo) {
  // determine which hive columns we will read
  List<HiveColumnHandle> readColumns = ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
  List<Integer> readHiveColumnIndexes = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
  // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
  setReadColumns(configuration, readHiveColumnIndexes);
  // Only propagate serialization schema configs by default
  Predicate<String> schemaFilter = schemaProperty -> schemaProperty.startsWith("serialization.");
  InputFormat<?, ?> inputFormat = getInputFormat(configuration, getInputFormatName(schema), true);
  JobConf jobConf = toJobConf(configuration);
  FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
  if (!customSplitInfo.isEmpty() && isHudiRealtimeSplit(customSplitInfo)) {
    fileSplit = recreateSplitWithCustomInfo(fileSplit, customSplitInfo);
    // Add additional column information for record reader
    List<String> readHiveColumnNames = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getName));
    jobConf.set(READ_COLUMN_NAMES_CONF_STR, Joiner.on(',').join(readHiveColumnNames));
    // Remove filter when using customSplitInfo as the record reader requires complete schema configs
    schemaFilter = schemaProperty -> true;
  }
  schema.stringPropertyNames().stream().filter(schemaFilter).forEach(name -> jobConf.set(name, schema.getProperty(name)));
  // add Airlift LZO and LZOP to head of codecs list so as to not override existing entries
  List<String> codecs = newArrayList(Splitter.on(",").trimResults().omitEmptyStrings().split(jobConf.get("io.compression.codecs", "")));
  if (!codecs.contains(LzoCodec.class.getName())) {
    codecs.add(0, LzoCodec.class.getName());
  }
  if (!codecs.contains(LzopCodec.class.getName())) {
    codecs.add(0, LzopCodec.class.getName());
  }
  jobConf.set("io.compression.codecs", codecs.stream().collect(joining(",")));
  try {
    RecordReader<WritableComparable, Writable> recordReader = (RecordReader<WritableComparable, Writable>) inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
    int headerCount = getHeaderCount(schema);
    // Only skip header rows when the split is at the beginning of the file
    if (start == 0 && headerCount > 0) {
      Utilities.skipHeader(recordReader, headerCount, recordReader.createKey(), recordReader.createValue());
    }
    int footerCount = getFooterCount(schema);
    if (footerCount > 0) {
      recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf);
    }
    return recordReader;
  } catch (IOException e) {
    if (e instanceof TextLineLengthLimitExceededException) {
      throw new PrestoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e);
    }
    throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", path, start, length, getInputFormatName(schema), firstNonNull(e.getMessage(), e.getClass().getName())), e);
  }
}
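The returned reader follows the old mapred contract: obtain a reusable key/value pair from the reader itself and iterate until next() returns false; in Presto the Hive record cursors consume it this way and hand each value to the table's SerDe. A minimal, hypothetical consumer sketch with the SerDe step elided and the generic types narrowed for illustration:

import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.RecordReader;

static void drain(RecordReader<WritableComparable, Writable> reader) throws IOException {
  WritableComparable key = reader.createKey();
  Writable value = reader.createValue();
  try {
    while (reader.next(key, value)) {
      // each (key, value) pair is one record of the split; pass 'value' to the table's SerDe
    }
  } finally {
    reader.close();
  }
}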