Use of org.apache.hadoop.mapreduce.TaskAttemptContext in project hadoop by apache.
The class TestGridMixClasses, method testLoadJobLoadRecordReader.
/*
 * Test LoadRecordReader, which reads data from a set of files (two mocked files here).
 */
@Test(timeout = 3000)
public void testLoadJobLoadRecordReader() throws Exception {
  LoadJob.LoadRecordReader test = new LoadJob.LoadRecordReader();
  Configuration conf = new Configuration();

  FileSystem fs1 = mock(FileSystem.class);
  when(fs1.open((Path) anyObject())).thenReturn(new FakeFSDataInputStream(new FakeInputStream()));
  Path p1 = mock(Path.class);
  when(p1.getFileSystem((JobConf) anyObject())).thenReturn(fs1);

  FileSystem fs2 = mock(FileSystem.class);
  when(fs2.open((Path) anyObject())).thenReturn(new FakeFSDataInputStream(new FakeInputStream()));
  Path p2 = mock(Path.class);
  when(p2.getFileSystem((JobConf) anyObject())).thenReturn(fs2);

  Path[] paths = { p1, p2 };
  long[] start = { 0, 0 };
  long[] lengths = { 1000, 1000 };
  String[] locations = { "temp1", "temp2" };
  CombineFileSplit cfsplit = new CombineFileSplit(paths, start, lengths, locations);

  double[] reduceBytes = { 100, 100 };
  double[] reduceRecords = { 2, 2 };
  long[] reduceOutputBytes = { 500, 500 };
  long[] reduceOutputRecords = { 2, 2 };
  ResourceUsageMetrics metrics = new ResourceUsageMetrics();
  ResourceUsageMetrics[] rMetrics = { new ResourceUsageMetrics(), new ResourceUsageMetrics() };
  LoadSplit input = new LoadSplit(cfsplit, 2, 3, 1500L, 2L, 3000L, 2L, reduceBytes, reduceRecords,
      reduceOutputBytes, reduceOutputRecords, metrics, rMetrics);

  TaskAttemptID taskId = new TaskAttemptID();
  TaskAttemptContext ctx = new TaskAttemptContextImpl(conf, taskId);
  test.initialize(input, ctx);

  GridmixRecord gr = test.getCurrentValue();
  int counter = 0;
  while (test.nextKeyValue()) {
    gr = test.getCurrentValue();
    if (counter == 0) {
      // read first file
      assertEquals(0.5, test.getProgress(), 0.001);
    } else if (counter == 1) {
      // read second file
      assertEquals(1.0, test.getProgress(), 0.001);
    }
    assertEquals(1000, gr.getSize());
    counter++;
  }
  assertEquals(1000, gr.getSize());
  // two files have been read
  assertEquals(2, counter);
  test.close();
}
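The essential wiring in this test is the construction of a TaskAttemptContext to pass into RecordReader.initialize. A minimal, self-contained sketch of that pattern follows, assuming Hadoop 2.x's org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl (the class and method names in the sketch are illustrative, not part of the test above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class TaskAttemptContextSketch {

  // Builds a context suitable for driving a RecordReader in a unit test;
  // a default TaskAttemptID is enough because no real task is running.
  public static TaskAttemptContext newTestContext(Configuration conf) {
    return new TaskAttemptContextImpl(conf, new TaskAttemptID());
  }
}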
Use of org.apache.hadoop.mapreduce.TaskAttemptContext in project hadoop by apache.
The class TestDistCacheEmulation, method validateSetupGenDC.
/**
 * Validate setupGenerateDistCacheData by validating <li>permissions of the
 * distributed cache directories and <li>content of the generated sequence
 * file. This includes validation of dist cache file paths and their file
 * sizes.
 */
private void validateSetupGenDC(Configuration jobConf, long[] sortedFileSizes)
    throws IOException, InterruptedException {
  // build things needed for validation
  long sumOfFileSizes = 0;
  for (int i = 0; i < sortedFileSizes.length; i++) {
    sumOfFileSizes += sortedFileSizes[i];
  }

  FileSystem fs = FileSystem.get(jobConf);
  assertEquals("Number of distributed cache files to be generated is wrong.",
      sortedFileSizes.length,
      jobConf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1));
  assertEquals("Total size of dist cache files to be generated is wrong.",
      sumOfFileSizes,
      jobConf.getLong(GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));

  Path filesListFile = new Path(jobConf.get(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST));
  FileStatus stat = fs.getFileStatus(filesListFile);
  assertEquals("Wrong permissions of dist Cache files list file " + filesListFile,
      new FsPermission((short) 0644), stat.getPermission());

  InputSplit split = new FileSplit(filesListFile, 0, stat.getLen(), (String[]) null);
  TaskAttemptContext taskContext = MapReduceTestUtil.createDummyMapTaskAttemptContext(jobConf);
  RecordReader<LongWritable, BytesWritable> reader =
      new GenerateDistCacheData.GenDCDataFormat().createRecordReader(split, taskContext);
  MapContext<LongWritable, BytesWritable, NullWritable, BytesWritable> mapContext =
      new MapContextImpl<LongWritable, BytesWritable, NullWritable, BytesWritable>(
          jobConf, taskContext.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mapContext);

  // start validating setupGenerateDistCacheData
  doValidateSetupGenDC(reader, fs, sortedFileSizes);
}
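The permission assertion above relies on the standard FileSystem/FsPermission APIs. A self-contained sketch of that check, with a hypothetical helper name and method (not part of the test), might look like this:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

public class PermissionCheckSketch {

  // Returns true if the file at 'path' carries exactly the expected octal permission,
  // e.g. (short) 0644 for rw-r--r-- as asserted in the test above.
  public static boolean hasPermission(Configuration conf, Path path, short expectedOctal)
      throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus stat = fs.getFileStatus(path);
    return new FsPermission(expectedOctal).equals(stat.getPermission());
  }
}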
Use of org.apache.hadoop.mapreduce.TaskAttemptContext in project presto by prestodb.
The class ParquetHiveRecordCursor, method createParquetRecordReader.
private ParquetRecordReader<FakeParquetRecord> createParquetRecordReader(
        HdfsEnvironment hdfsEnvironment,
        String sessionUser,
        Configuration configuration,
        Path path,
        long start,
        long length,
        List<HiveColumnHandle> columns,
        boolean useParquetColumnNames,
        TypeManager typeManager,
        boolean predicatePushdownEnabled,
        TupleDomain<HiveColumnHandle> effectivePredicate) {
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(sessionUser,
                () -> ParquetFileReader.readFooter(configuration, path, NO_FILTER));
        List<BlockMetaData> blocks = parquetMetadata.getBlocks();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        PrestoReadSupport readSupport = new PrestoReadSupport(useParquetColumnNames, columns, fileSchema);

        List<parquet.schema.Type> fields = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getParquetType(column, fileSchema, useParquetColumnNames))
                .filter(Objects::nonNull)
                .collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);

        LongArrayList offsets = new LongArrayList(blocks.size());
        for (BlockMetaData block : blocks) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                if (predicatePushdownEnabled) {
                    ParquetPredicate parquetPredicate =
                            buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
                    if (predicateMatches(parquetPredicate, block, dataSource, requestedSchema, effectivePredicate)) {
                        offsets.add(block.getStartingPos());
                    }
                }
                else {
                    offsets.add(block.getStartingPos());
                }
            }
        }

        ParquetInputSplit split = new ParquetInputSplit(path, start, start + length, length, null, offsets.toLongArray());
        TaskAttemptContext taskContext = ContextUtil.newTaskAttemptContext(configuration, new TaskAttemptID());
        return hdfsEnvironment.doAs(sessionUser, () -> {
            ParquetRecordReader<FakeParquetRecord> realReader = new PrestoParquetRecordReader(readSupport);
            realReader.initialize(split, taskContext);
            return realReader;
        });
    }
    catch (Exception e) {
        Throwables.propagateIfInstanceOf(e, PrestoException.class);
        if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
            throw Throwables.propagate(e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s",
                path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
    finally {
        if (dataSource != null) {
            try {
                dataSource.close();
            }
            catch (IOException ignored) {
            }
        }
    }
}
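Once initialized, the returned reader is consumed through the standard mapreduce RecordReader protocol (nextKeyValue / getCurrentValue / close). The following is a hedged, generic sketch of that consumption loop, not Presto's actual cursor code; the class and method names are illustrative:

import java.io.IOException;
import org.apache.hadoop.mapreduce.RecordReader;

final class RecordReaderSketch {

    // Drains an already-initialized RecordReader and returns how many records it produced.
    static <K, V> long drain(RecordReader<K, V> reader) throws IOException, InterruptedException {
        long count = 0;
        try {
            while (reader.nextKeyValue()) {
                reader.getCurrentValue(); // a real caller would decode the value here
                count++;
            }
        }
        finally {
            reader.close();
        }
        return count;
    }
}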
Use of org.apache.hadoop.mapreduce.TaskAttemptContext in project mongo-hadoop by mongodb.
The class GridFSInputFormatTest, method mockTaskAttemptContext.
private static TaskAttemptContext mockTaskAttemptContext(final Configuration conf) {
    TaskAttemptContext context = mock(TaskAttemptContext.class);
    when(context.getConfiguration()).thenReturn(conf);
    return context;
}
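The test below also calls a mockJobContext helper that is not reproduced in this excerpt. A plausible sketch, assuming it only needs to hand back the Configuration and uses the same static Mockito imports as the helper above, would mirror that helper (this is an assumption, not the project's actual code):

// Hypothetical counterpart to mockTaskAttemptContext (not part of the original test):
// only getConfiguration() is stubbed here.
private static JobContext mockJobContext(final Configuration conf) {
    JobContext context = mock(JobContext.class);
    when(context.getConfiguration()).thenReturn(conf);
    return context;
}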
Use of org.apache.hadoop.mapreduce.TaskAttemptContext in project mongo-hadoop by mongodb.
The class GridFSInputFormatTest, method testReadBinaryFiles.
@Test
public void testReadBinaryFiles() throws IOException, InterruptedException, URISyntaxException {
    Configuration conf = getConfiguration();
    MongoConfigUtil.setQuery(conf, new BasicDBObject("filename", "orders.bson"));
    MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
    MongoConfigUtil.setGridFSReadBinary(conf, true);

    JobContext context = mockJobContext(conf);
    TaskAttemptContext taskContext = mockTaskAttemptContext(conf);

    List<InputSplit> splits = inputFormat.getSplits(context);
    assertEquals(1, splits.size());

    int i = 0;
    byte[] buff = null;
    for (InputSplit split : splits) {
        GridFSInputFormat.GridFSBinaryRecordReader reader = new GridFSInputFormat.GridFSBinaryRecordReader();
        reader.initialize(split, taskContext);
        for (; reader.nextKeyValue(); ++i) {
            buff = new byte[reader.getCurrentValue().getLength()];
            // BytesWritable.copyBytes does not exist in Hadoop 1.2.
            System.arraycopy(reader.getCurrentValue().getBytes(), 0, buff, 0, buff.length);
        }
    }
    // Only one record to read on the split.
    assertEquals(1, i);
    assertNotNull(buff);
    assertEquals(bson.getLength(), buff.length);
}
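On Hadoop 2.x, the System.arraycopy workaround above can be replaced with BytesWritable.copyBytes(), which returns a correctly sized copy of the valid bytes. A minimal sketch (the helper class and method names are illustrative):

import org.apache.hadoop.io.BytesWritable;

final class BytesWritableSketch {

    // Hadoop 2.x alternative to the manual arraycopy used in the test above.
    static byte[] toByteArray(BytesWritable writable) {
        return writable.copyBytes();
    }
}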