use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.
the class SchedulerTest method testSchedulerSmallerHDFS.
* Test the case where the HDFS cluster is a larger than the Hyracks cluster
* @throws Exception
public void testSchedulerSmallerHDFS() throws Exception {
Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
InputSplit[] fileSplits = new InputSplit[12];
fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "", "", "" });
fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "", "", "" });
fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "", "", "" });
fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "", "", "" });
fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "", "", "" });
fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "", "", "" });
fileSplits[6] = new FileSplit(new Path("part-7"), 0, 0, new String[] { "", "", "" });
fileSplits[7] = new FileSplit(new Path("part-8"), 0, 0, new String[] { "", "", "" });
fileSplits[8] = new FileSplit(new Path("part-9"), 0, 0, new String[] { "", "", "" });
fileSplits[9] = new FileSplit(new Path("part-10"), 0, 0, new String[] { "", "", "" });
fileSplits[10] = new FileSplit(new Path("part-11"), 0, 0, new String[] { "", "", "" });
fileSplits[11] = new FileSplit(new Path("part-12"), 0, 0, new String[] { "", "", "" });
String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc5", "nc6", "nc5", "nc6" };
Scheduler scheduler = new Scheduler(ncNameToNcInfos);
String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
ClusterTopology topology = parseTopology();
scheduler = new Scheduler(ncNameToNcInfos, topology);
locationConstraints = scheduler.getLocationConstraints(fileSplits);
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.
the class SchedulerTest method testSchedulerSimple.
* Test the scheduler for the case when the Hyracks cluster is the HDFS cluster
* @throws Exception
public void testSchedulerSimple() throws Exception {
Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
InputSplit[] fileSplits = new InputSplit[6];
fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "", "", "" });
fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "", "", "" });
fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "", "", "" });
fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "", "", "" });
fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "", "", "" });
fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "", "", "" });
String[] expectedResults = new String[] { "nc1", "nc4", "nc6", "nc2", "nc3", "nc5" };
Scheduler scheduler = new Scheduler(ncNameToNcInfos);
String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
ClusterTopology topology = parseTopology();
scheduler = new Scheduler(ncNameToNcInfos, topology);
locationConstraints = scheduler.getLocationConstraints(fileSplits);
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.
the class SchedulerTest method testSchedulerLargerHDFS.
* Test the case where the HDFS cluster is a larger than the Hyracks cluster
* @throws Exception
public void testSchedulerLargerHDFS() throws Exception {
int dataPort = 5099;
int resultPort = 5098;
int messagingPort = 5097;
Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(4, "nc", "10.0.0.", dataPort, resultPort, messagingPort);
ncNameToNcInfos.put("nc7", new NodeControllerInfo("nc7", NodeStatus.ALIVE, new NetworkAddress("", dataPort), new NetworkAddress("", resultPort), new NetworkAddress("", messagingPort), 2));
ncNameToNcInfos.put("nc12", new NodeControllerInfo("nc12", NodeStatus.ALIVE, new NetworkAddress("", dataPort), new NetworkAddress("", resultPort), new NetworkAddress("", messagingPort), 2));
InputSplit[] fileSplits = new InputSplit[12];
fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "", "", "" });
fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "", "", "" });
fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "", "", "" });
fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "", "", "" });
fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "", "", "" });
fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "", "", "" });
fileSplits[6] = new FileSplit(new Path("part-7"), 0, 0, new String[] { "", "", "" });
fileSplits[7] = new FileSplit(new Path("part-8"), 0, 0, new String[] { "", "", "" });
fileSplits[8] = new FileSplit(new Path("part-12"), 0, 0, new String[] { "", "", "" });
fileSplits[9] = new FileSplit(new Path("part-10"), 0, 0, new String[] { "", "", "" });
fileSplits[10] = new FileSplit(new Path("part-11"), 0, 0, new String[] { "", "", "" });
fileSplits[11] = new FileSplit(new Path("part-9"), 0, 0, new String[] { "", "", "" });
Scheduler scheduler = new Scheduler(ncNameToNcInfos);
String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc12", "nc7", "nc7", "nc12" };
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc7", "nc12", "nc7", "nc12" };
ClusterTopology topology = parseTopology();
scheduler = new Scheduler(ncNameToNcInfos, topology);
locationConstraints = scheduler.getLocationConstraints(fileSplits);
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.
the class SchedulerTest method testSchedulerSmallerHDFSOdd.
* Test the case where the HDFS cluster is a larger than the Hyracks cluster
* @throws Exception
public void testSchedulerSmallerHDFSOdd() throws Exception {
Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
InputSplit[] fileSplits = new InputSplit[13];
fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "", "", "" });
fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "", "", "" });
fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "", "", "" });
fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "", "", "" });
fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "", "", "" });
fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "", "", "" });
fileSplits[6] = new FileSplit(new Path("part-7"), 0, 0, new String[] { "", "", "" });
fileSplits[7] = new FileSplit(new Path("part-8"), 0, 0, new String[] { "", "", "" });
fileSplits[8] = new FileSplit(new Path("part-9"), 0, 0, new String[] { "", "", "" });
fileSplits[9] = new FileSplit(new Path("part-10"), 0, 0, new String[] { "", "", "" });
fileSplits[10] = new FileSplit(new Path("part-11"), 0, 0, new String[] { "", "", "" });
fileSplits[11] = new FileSplit(new Path("part-12"), 0, 0, new String[] { "", "", "" });
fileSplits[12] = new FileSplit(new Path("part-13"), 0, 0, new String[] { "", "", "" });
String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc5", "nc1", "nc5", "nc2", "nc4" };
Scheduler scheduler = new Scheduler(ncNameToNcInfos);
String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
ClusterTopology topology = parseTopology();
scheduler = new Scheduler(ncNameToNcInfos, topology);
locationConstraints = scheduler.getLocationConstraints(fileSplits);
for (int i = 0; i < locationConstraints.length; i++) {
Assert.assertEquals(locationConstraints[i], expectedResults[i]);
use of org.apache.hadoop.mapred.FileSplit in project drill by apache.
the class HiveDrillNativeScanBatchCreator method getBatch.
/**
 * Creates the {@link ScanBatch} for a Hive table scan that reads Parquet files natively.
 *
 * <p>For every input split the Parquet footer is read (and cached per file path for the
 * duration of this call), the row groups belonging to the split are looked up through
 * {@code getRowGroupNumbersFromFileSplit}, and {@code ParquetRecordReader} instances are
 * added to the reader list. Partition values are exposed as implicit columns whose names are
 * the partition designator followed by the partition column index. If no reader was created
 * at all, a single {@code HiveDefaultReader} is added so that a schema can still be output.
 *
 * <p>NOTE(review): this copy of the method has lost its closing braces and several statement
 * bodies (see the inline notes), so the exact nesting cannot be established from here -
 * confirm against the upstream source before changing any logic.
 *
 * @param context fragment context; supplies options, the operator context and the query user name
 * @param config sub-scan definition supplying the table, input splits, partitions, columns and Hive configuration
 * @param children not referenced in the visible code
 * @return a scan batch over the created readers and the list of implicit-column maps
 * @throws ExecutionSetupException if an {@link IOException} or {@link RuntimeException} occurs while the readers are created
 */
public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
final HiveTableWithColumnCache table = config.getTable();
final List<InputSplit> splits = config.getInputSplits();
final List<HivePartition> partitions = config.getPartitions();
final List<SchemaPath> columns = config.getColumns();
// Prefix used to name partition columns; read from the session/system options.
final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
// NOTE(review): nothing is added to implicitColumns in the visible code; presumably each
// reader's implicitValues map is appended in a line missing from this copy - confirm.
List<Map<String, String>> implicitColumns = Lists.newLinkedList();
boolean selectAllQuery = AbstractRecordReader.isStarQuery(columns);
final boolean hasPartitions = (partitions != null && partitions.size() > 0);
final List<String[]> partitionColumns = Lists.newArrayList();
final List<Integer> selectedPartitionColumns = Lists.newArrayList();
// For a star query the requested columns are passed through unchanged.
List<SchemaPath> newColumns = columns;
if (!selectAllQuery) {
// Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
// ParquetRecordReader. Partition columns are passed to ScanBatch.
newColumns = Lists.newArrayList();
// A column counts as a partition column when its whole name is the designator followed by digits.
Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
for (SchemaPath column : columns) {
Matcher m = pattern.matcher(column.getAsUnescapedPath());
// NOTE(review): both branches are empty in this copy; presumably the matching branch records
// the partition index in selectedPartitionColumns and the other branch adds the column to
// newColumns - confirm against the upstream source.
if (m.matches()) {
} else {
final OperatorContext oContext = context.newOperatorContext(config);
// NOTE(review): currentPartitionIndex is never advanced in the visible code; presumably it
// is incremented once per split in a line missing from this copy - confirm.
int currentPartitionIndex = 0;
final List<RecordReader> readers = Lists.newArrayList();
final HiveConf conf = config.getHiveConf();
// TODO: In future we can get this cache from Metadata cached on filesystem.
final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
// Tracks the implicit-value map with the most entries seen so far.
Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
try {
for (InputSplit split : splits) {
final FileSplit fileSplit = (FileSplit) split;
final Path finalPath = fileSplit.getPath();
final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
final FileSystem fs = finalPath.getFileSystem(cloneJob);
// Read each file's footer at most once per call, keyed by the path string.
ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
if (parquetMetadata == null) {
parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
footerCache.put(finalPath.toString(), parquetMetadata);
final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
for (int rowGroupNum : rowGroupNums) {
//DRILL-5009 : Skip the row group if the row count is zero
// NOTE(review): the body of this check is missing in this copy; presumably it skips to the next row group.
if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
// Drill has only ever written a single row group per file, only detect corruption
// in the first row group
ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
// NOTE(review): the debug statement guarded by this check is missing in this copy.
if (logger.isDebugEnabled()) {
readers.add(new ParquetRecordReader(context, Path.getPathWithoutSchemeAndAuthority(finalPath).toString(), rowGroupNum, fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), parquetMetadata, newColumns, containsCorruptDates));
// Implicit (partition) column values for this reader, keyed by designator + partition column index.
Map<String, String> implicitValues = Maps.newLinkedHashMap();
if (hasPartitions) {
List<String> values = partitions.get(currentPartitionIndex).getValues();
for (int i = 0; i < values.size(); i++) {
// Keep every partition value for a star query, otherwise only the selected ones.
if (selectAllQuery || selectedPartitionColumns.contains(i)) {
implicitValues.put(partitionDesignator + i, values.get(i));
if (implicitValues.size() > mapWithMaxColumns.size()) {
mapWithMaxColumns = implicitValues;
} catch (final IOException | RuntimeException e) {
// Close the readers created so far, then report the failure with the original exception as cause.
AutoCloseables.close(e, readers);
throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
// all readers should have the same number of implicit columns, add missing ones with value null
mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
for (Map<String, String> map : implicitColumns) {
map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
// create an empty RecordReader to output the schema
if (readers.size() == 0) {
readers.add(new HiveDefaultReader(table, null, null, columns, context, conf, ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
return new ScanBatch(config, context, oContext, readers.iterator(), implicitColumns);