Examples with InputSplit - org.apache.hadoop.mapred.InputSplit

Example 26 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project phoenix by apache.

the class PhoenixInputFormat method generateSplits.

private List<InputSplit> generateSplits(final JobConf jobConf, final QueryPlan qplan, final List<KeyRange> splits, String query) throws IOException {
    Preconditions.checkNotNull(qplan);
    Preconditions.checkNotNull(splits);
    final List<InputSplit> psplits = Lists.newArrayListWithExpectedSize(splits.size());
    Path[] tablePaths = FileInputFormat.getInputPaths(ShimLoader.getHadoopShims().newJobContext(new Job(jobConf)));
    boolean splitByStats = jobConf.getBoolean(PhoenixStorageHandlerConstants.SPLIT_BY_STATS, false);
    setScanCacheSize(jobConf);
    // Adding Localization
    HConnection connection = HConnectionManager.createConnection(PhoenixConnectionUtil.getConfiguration(jobConf));
    RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(qplan.getTableRef().getTable().getPhysicalName().toString()));
    RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, connection.getAdmin());
    for (List<Scan> scans : qplan.getScans()) {
        PhoenixInputSplit inputSplit;
        HRegionLocation location = regionLocator.getRegionLocation(scans.get(0).getStartRow(), false);
        long regionSize = sizeCalculator.getRegionSize(location.getRegionInfo().getRegionName());
        String regionLocation = PhoenixStorageHandlerUtil.getRegionLocation(location, LOG);
        if (splitByStats) {
            for (Scan aScan : scans) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Split for  scan : " + aScan + "with scanAttribute : " + aScan.getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : [" + aScan.getCaching() + ", " + aScan.getCacheBlocks() + ", " + aScan.getBatch() + "] and  regionLocation : " + regionLocation);
                }
                inputSplit = new PhoenixInputSplit(Lists.newArrayList(aScan), tablePaths[0], regionLocation, regionSize);
                inputSplit.setQuery(query);
                psplits.add(inputSplit);
            }
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Scan count[" + scans.size() + "] : " + Bytes.toStringBinary(scans.get(0).getStartRow()) + " ~ " + Bytes.toStringBinary(scans.get(scans.size() - 1).getStopRow()));
                LOG.debug("First scan : " + scans.get(0) + "with scanAttribute : " + scans.get(0).getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : " + "[" + scans.get(0).getCaching() + ", " + scans.get(0).getCacheBlocks() + ", " + scans.get(0).getBatch() + "] and  regionLocation : " + regionLocation);
                for (int i = 0, limit = scans.size(); i < limit; i++) {
                    LOG.debug("EXPECTED_UPPER_REGION_KEY[" + i + "] : " + Bytes.toStringBinary(scans.get(i).getAttribute(BaseScannerRegionObserver.EXPECTED_UPPER_REGION_KEY)));
                }
            }
            inputSplit = new PhoenixInputSplit(scans, tablePaths[0], regionLocation, regionSize);
            inputSplit.setQuery(query);
            psplits.add(inputSplit);
        }
    }
    return psplits;
}

Also used : Path(org.apache.hadoop.fs.Path) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) RegionSizeCalculator(org.apache.hadoop.hbase.util.RegionSizeCalculator) HConnection(org.apache.hadoop.hbase.client.HConnection) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) Scan(org.apache.hadoop.hbase.client.Scan) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 27 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project drill by apache.

the class HiveDrillNativeScanBatchCreator method getBatch.

@Override
public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan config, List<RecordBatch> children) throws ExecutionSetupException {
    final HiveTableWithColumnCache table = config.getTable();
    final List<InputSplit> splits = config.getInputSplits();
    final List<HivePartition> partitions = config.getPartitions();
    final List<SchemaPath> columns = config.getColumns();
    final String partitionDesignator = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val;
    List<Map<String, String>> implicitColumns = Lists.newLinkedList();
    boolean selectAllQuery = AbstractRecordReader.isStarQuery(columns);
    final boolean hasPartitions = (partitions != null && partitions.size() > 0);
    final List<String[]> partitionColumns = Lists.newArrayList();
    final List<Integer> selectedPartitionColumns = Lists.newArrayList();
    List<SchemaPath> newColumns = columns;
    if (!selectAllQuery) {
        // Separate out the partition and non-partition columns. Non-partition columns are passed directly to the
        // ParquetRecordReader. Partition columns are passed to ScanBatch.
        newColumns = Lists.newArrayList();
        Pattern pattern = Pattern.compile(String.format("%s[0-9]+", partitionDesignator));
        for (SchemaPath column : columns) {
            Matcher m = pattern.matcher(column.getAsUnescapedPath());
            if (m.matches()) {
                selectedPartitionColumns.add(Integer.parseInt(column.getAsUnescapedPath().substring(partitionDesignator.length())));
            } else {
                newColumns.add(column);
            }
        }
    }
    final OperatorContext oContext = context.newOperatorContext(config);
    int currentPartitionIndex = 0;
    final List<RecordReader> readers = Lists.newArrayList();
    final HiveConf conf = config.getHiveConf();
    // TODO: In future we can get this cache from Metadata cached on filesystem.
    final Map<String, ParquetMetadata> footerCache = Maps.newHashMap();
    Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
    try {
        for (InputSplit split : splits) {
            final FileSplit fileSplit = (FileSplit) split;
            final Path finalPath = fileSplit.getPath();
            final JobConf cloneJob = new ProjectionPusher().pushProjectionsAndFilters(new JobConf(conf), finalPath.getParent());
            final FileSystem fs = finalPath.getFileSystem(cloneJob);
            ParquetMetadata parquetMetadata = footerCache.get(finalPath.toString());
            if (parquetMetadata == null) {
                parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath);
                footerCache.put(finalPath.toString(), parquetMetadata);
            }
            final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
            for (int rowGroupNum : rowGroupNums) {
                //DRILL-5009 : Skip the row group if the row count is zero
                if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
                    continue;
                }
                // Drill has only ever written a single row group per file, only detect corruption
                // in the first row group
                ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(parquetMetadata, config.getColumns(), true);
                if (logger.isDebugEnabled()) {
                    logger.debug(containsCorruptDates.toString());
                }
                readers.add(new ParquetRecordReader(context, Path.getPathWithoutSchemeAndAuthority(finalPath).toString(), rowGroupNum, fs, CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0), parquetMetadata, newColumns, containsCorruptDates));
                Map<String, String> implicitValues = Maps.newLinkedHashMap();
                if (hasPartitions) {
                    List<String> values = partitions.get(currentPartitionIndex).getValues();
                    for (int i = 0; i < values.size(); i++) {
                        if (selectAllQuery || selectedPartitionColumns.contains(i)) {
                            implicitValues.put(partitionDesignator + i, values.get(i));
                        }
                    }
                }
                implicitColumns.add(implicitValues);
                if (implicitValues.size() > mapWithMaxColumns.size()) {
                    mapWithMaxColumns = implicitValues;
                }
            }
            currentPartitionIndex++;
        }
    } catch (final IOException | RuntimeException e) {
        AutoCloseables.close(e, readers);
        throw new ExecutionSetupException("Failed to create RecordReaders. " + e.getMessage(), e);
    }
    // all readers should have the same number of implicit columns, add missing ones with value null
    mapWithMaxColumns = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
        map.putAll(Maps.difference(map, mapWithMaxColumns).entriesOnlyOnRight());
    }
    // create an empty RecordReader to output the schema
    if (readers.size() == 0) {
        readers.add(new HiveDefaultReader(table, null, null, columns, context, conf, ImpersonationUtil.createProxyUgi(config.getUserName(), context.getQueryUserName())));
    }
    return new ScanBatch(config, context, oContext, readers.iterator(), implicitColumns);
}

Also used : ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) Matcher(java.util.regex.Matcher) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ProjectionPusher(org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) RecordReader(org.apache.drill.exec.store.RecordReader) AbstractRecordReader(org.apache.drill.exec.store.AbstractRecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) SchemaPath(org.apache.drill.common.expression.SchemaPath) OperatorContext(org.apache.drill.exec.ops.OperatorContext) FileSystem(org.apache.hadoop.fs.FileSystem) ScanBatch(org.apache.drill.exec.physical.impl.ScanBatch) HiveConf(org.apache.hadoop.hive.conf.HiveConf) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) Pattern(java.util.regex.Pattern) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) IOException(java.io.IOException) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) Map(java.util.Map)

Example 28 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class DataflowTest method testHDFSReadWriteOperators.

/**
     * Test a job with only HDFS read and writes.
     *
     * @throws Exception
     */
public void testHDFSReadWriteOperators() throws Exception {
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
    conf.setInputFormat(TextInputFormat.class);
    Scheduler scheduler = new Scheduler(HyracksUtils.CC_HOST, HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, numberOfNC * 4);
    String[] readSchedule = scheduler.getLocationConstraints(splits);
    JobSpecification jobSpec = new JobSpecification();
    RecordDescriptor recordDesc = new RecordDescriptor(new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    String[] locations = new String[] { HyracksUtils.NC1_ID, HyracksUtils.NC1_ID, HyracksUtils.NC2_ID, HyracksUtils.NC2_ID };
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(jobSpec, recordDesc, conf, splits, readSchedule, new TextKeyValueParserFactory());
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, readOperator, locations);
    ExternalSortOperatorDescriptor sortOperator = new ExternalSortOperatorDescriptor(jobSpec, 10, new int[] { 0 }, new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, recordDesc);
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, sortOperator, locations);
    HDFSWriteOperatorDescriptor writeOperator = new HDFSWriteOperatorDescriptor(jobSpec, conf, new TextTupleWriterFactory());
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, writeOperator, HyracksUtils.NC1_ID);
    jobSpec.connect(new OneToOneConnectorDescriptor(jobSpec), readOperator, 0, sortOperator, 0);
    jobSpec.connect(new MToNPartitioningMergingConnectorDescriptor(jobSpec, new FieldHashPartitionComputerFactory(new int[] { 0 }, new IBinaryHashFunctionFactory[] { RawBinaryHashFunctionFactory.INSTANCE }), new int[] { 0 }, new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, null), sortOperator, 0, writeOperator, 0);
    jobSpec.addRoot(writeOperator);
    IHyracksClientConnection client = new HyracksConnection(HyracksUtils.CC_HOST, HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);
    JobId jobId = client.startJob(jobSpec);
    client.waitForCompletion(jobId);
    Assert.assertEquals(true, checkResults());
}

Also used : Path(org.apache.hadoop.fs.Path) IHyracksClientConnection(org.apache.hyracks.api.client.IHyracksClientConnection) Scheduler(org.apache.hyracks.hdfs.scheduler.Scheduler) RecordDescriptor(org.apache.hyracks.api.dataflow.value.RecordDescriptor) MToNPartitioningMergingConnectorDescriptor(org.apache.hyracks.dataflow.std.connectors.MToNPartitioningMergingConnectorDescriptor) IBinaryComparatorFactory(org.apache.hyracks.api.dataflow.value.IBinaryComparatorFactory) OneToOneConnectorDescriptor(org.apache.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor) UTF8StringSerializerDeserializer(org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer) FieldHashPartitionComputerFactory(org.apache.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFactory) TextTupleWriterFactory(org.apache.hyracks.hdfs.lib.TextTupleWriterFactory) ExternalSortOperatorDescriptor(org.apache.hyracks.dataflow.std.sort.ExternalSortOperatorDescriptor) JobSpecification(org.apache.hyracks.api.job.JobSpecification) HyracksConnection(org.apache.hyracks.api.client.HyracksConnection) InputSplit(org.apache.hadoop.mapred.InputSplit) TextKeyValueParserFactory(org.apache.hyracks.hdfs.lib.TextKeyValueParserFactory) JobId(org.apache.hyracks.api.job.JobId)

Example 29 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class SchedulerTest method testSchedulercBoundary.

/**
     * Test boundary cases where splits array is empty or null
     *
     * @throws Exception
     */
public void testSchedulercBoundary() throws Exception {
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
    /** test empty file splits */
    InputSplit[] fileSplits = new InputSplit[0];
    String[] expectedResults = new String[] {};
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
    ClusterTopology topology = parseTopology();
    scheduler = new Scheduler(ncNameToNcInfos, topology);
    locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
    fileSplits = null;
    expectedResults = new String[] {};
    scheduler = new Scheduler(ncNameToNcInfos);
    locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
    scheduler = new Scheduler(ncNameToNcInfos, topology);
    locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
}

Also used : NodeControllerInfo(org.apache.hyracks.api.client.NodeControllerInfo) ClusterTopology(org.apache.hyracks.api.topology.ClusterTopology) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 30 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class SchedulerTest method testSchedulerSmallerHDFS.

/**
     * Test the case where the HDFS cluster is a larger than the Hyracks cluster
     *
     * @throws Exception
     */
public void testSchedulerSmallerHDFS() throws Exception {
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
    InputSplit[] fileSplits = new InputSplit[12];
    fileSplits[0] = new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
    fileSplits[1] = new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[2] = new FileSplit(new Path("part-3"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.3" });
    fileSplits[3] = new FileSplit(new Path("part-4"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.3" });
    fileSplits[4] = new FileSplit(new Path("part-5"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[5] = new FileSplit(new Path("part-6"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" });
    fileSplits[6] = new FileSplit(new Path("part-7"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
    fileSplits[7] = new FileSplit(new Path("part-8"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[8] = new FileSplit(new Path("part-9"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.1" });
    fileSplits[9] = new FileSplit(new Path("part-10"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.2" });
    fileSplits[10] = new FileSplit(new Path("part-11"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" });
    fileSplits[11] = new FileSplit(new Path("part-12"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" });
    String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc5", "nc6", "nc5", "nc6" };
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
    ClusterTopology topology = parseTopology();
    scheduler = new Scheduler(ncNameToNcInfos, topology);
    locationConstraints = scheduler.getLocationConstraints(fileSplits);
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
}

Also used : Path(org.apache.hadoop.fs.Path) NodeControllerInfo(org.apache.hyracks.api.client.NodeControllerInfo) FileSplit(org.apache.hadoop.mapred.FileSplit) ClusterTopology(org.apache.hyracks.api.topology.ClusterTopology) InputSplit(org.apache.hadoop.mapred.InputSplit)

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit)161 Path (org.apache.hadoop.fs.Path)57 JobConf (org.apache.hadoop.mapred.JobConf)56 Test (org.junit.Test)49 IOException (java.io.IOException)47 ArrayList (java.util.ArrayList)29 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)27 FileSplit (org.apache.hadoop.mapred.FileSplit)24 FileSystem (org.apache.hadoop.fs.FileSystem)21 TextInputFormat (org.apache.hadoop.mapred.TextInputFormat)21 InputFormat (org.apache.hadoop.mapred.InputFormat)19 RecordWriter (org.apache.hadoop.mapred.RecordWriter)19 NullWritable (org.apache.hadoop.io.NullWritable)18 Text (org.apache.hadoop.io.Text)18 Configuration (org.apache.hadoop.conf.Configuration)14 LongWritable (org.apache.hadoop.io.LongWritable)11 FileInputFormat (org.apache.hadoop.mapred.FileInputFormat)10 Properties (java.util.Properties)9 TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint)9 HashMap (java.util.HashMap)8