
Example 31 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project incubator-systemml by apache.

the class DelegatingInputFormat method getRecordReader.

@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
    // Find the InputFormat and then the RecordReader from the
    // TaggedInputSplit.
    TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
    InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(taggedInputSplit.getInputFormatClass(), conf);
    InputSplit inputSplit = taggedInputSplit.getInputSplit();
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
    }
    return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
Also used : FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
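
A minimal standalone sketch of the same delegation pattern, assuming a hypothetical local input file and using TextInputFormat in place of the class carried by the TaggedInputSplit: instantiate the InputFormat by reflection, then ask it for a RecordReader over a FileSplit.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class DelegationSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Hypothetical path and length; a real split would come from InputFormat.getSplits().
        FileSplit split = new FileSplit(new Path("/tmp/input.txt"), 0L, 1024L, new String[0]);
        // Resolve the concrete InputFormat by reflection, as DelegatingInputFormat does.
        InputFormat<LongWritable, Text> inputFormat =
            ReflectionUtils.newInstance(TextInputFormat.class, conf);
        RecordReader<LongWritable, Text> reader =
            inputFormat.getRecordReader(split, conf, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            System.out.println(key + "\t" + value);
        }
        reader.close();
    }
}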

Example 32 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

the class CustomPartitionVertex method getBucketSplitMapForPath.

/*
   * This method generates the map of bucket to file splits.
   */
private Multimap<Integer, InputSplit> getBucketSplitMapForPath(Map<String, Set<FileSplit>> pathFileSplitsMap) {
    int bucketNum = 0;
    Multimap<Integer, InputSplit> bucketToInitialSplitMap = ArrayListMultimap.<Integer, InputSplit>create();
    for (Map.Entry<String, Set<FileSplit>> entry : pathFileSplitsMap.entrySet()) {
        int bucketId = bucketNum % numBuckets;
        for (FileSplit fsplit : entry.getValue()) {
            bucketToInitialSplitMap.put(bucketId, fsplit);
        }
        bucketNum++;
    }
    // If there were fewer paths than buckets, fill the remaining bucket ids by
    // reusing the splits already assigned to the earlier buckets as well.
    if (bucketNum < numBuckets) {
        int loopedBucketId = 0;
        for (; bucketNum < numBuckets; bucketNum++) {
            for (InputSplit fsplit : bucketToInitialSplitMap.get(loopedBucketId)) {
                bucketToInitialSplitMap.put(bucketNum, fsplit);
            }
            loopedBucketId++;
        }
    }
    return bucketToInitialSplitMap;
}
Also used : Set(java.util.Set) TreeSet(java.util.TreeSet) ByteString(com.google.protobuf.ByteString) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) Map(java.util.Map) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint)
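
A self-contained sketch of the round-robin bucket assignment above, with plain Strings standing in for FileSplit and a made-up path map and bucket count: the splits of the i-th path go to bucket (i % numBuckets), and any leftover bucket ids reuse the splits already assigned to earlier buckets.

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class BucketAssignmentSketch {
    static Multimap<Integer, String> assign(Map<String, List<String>> pathToSplits, int numBuckets) {
        Multimap<Integer, String> bucketToSplits = ArrayListMultimap.create();
        int bucketNum = 0;
        // First pass: the splits of the i-th path go to bucket (i % numBuckets).
        for (Map.Entry<String, List<String>> entry : pathToSplits.entrySet()) {
            int bucketId = bucketNum % numBuckets;
            for (String split : entry.getValue()) {
                bucketToSplits.put(bucketId, split);
            }
            bucketNum++;
        }
        // Second pass: if there were fewer paths than buckets, fill the remaining
        // bucket ids by reusing the splits of the earlier buckets.
        int loopedBucketId = 0;
        for (; bucketNum < numBuckets; bucketNum++) {
            for (String split : new ArrayList<>(bucketToSplits.get(loopedBucketId))) {
                bucketToSplits.put(bucketNum, split);
            }
            loopedBucketId++;
        }
        return bucketToSplits;
    }

    public static void main(String[] args) {
        Map<String, List<String>> paths = new LinkedHashMap<>();
        paths.put("/warehouse/t/000000_0", Arrays.asList("split-a"));
        paths.put("/warehouse/t/000001_0", Arrays.asList("split-b", "split-c"));
        // With 4 buckets and 2 paths, buckets 2 and 3 reuse the splits of buckets 0 and 1.
        System.out.println(assign(paths, 4));
    }
}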

Example 33 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

the class StatsNoJobTask method aggregateStats.

private int aggregateStats(ExecutorService threadPool, Hive db) {
    int ret = 0;
    try {
        Collection<Partition> partitions = null;
        if (work.getPrunedPartitionList() == null) {
            partitions = getPartitionsList();
        } else {
            partitions = work.getPrunedPartitionList().getPartitions();
        }
        // non-partitioned table
        if (partitions == null) {
            org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
            Map<String, String> parameters = tTable.getParameters();
            try {
                Path dir = new Path(tTable.getSd().getLocation());
                long numRows = 0;
                long rawDataSize = 0;
                long fileSize = 0;
                long numFiles = 0;
                FileSystem fs = dir.getFileSystem(conf);
                FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
                boolean statsAvailable = false;
                for (FileStatus file : fileList) {
                    if (!file.isDir()) {
                        InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(table.getInputFormatClass(), jc);
                        InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table.getDataLocation().toString() });
                        if (file.getLen() == 0) {
                            numFiles += 1;
                            statsAvailable = true;
                        } else {
                            org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
                            StatsProvidingRecordReader statsRR;
                            if (recordReader instanceof StatsProvidingRecordReader) {
                                statsRR = (StatsProvidingRecordReader) recordReader;
                                numRows += statsRR.getStats().getRowCount();
                                rawDataSize += statsRR.getStats().getRawDataSize();
                                fileSize += file.getLen();
                                numFiles += 1;
                                statsAvailable = true;
                            }
                            recordReader.close();
                        }
                    }
                }
                if (statsAvailable) {
                    parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
                    parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
                    parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
                    parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
                    EnvironmentContext environmentContext = new EnvironmentContext();
                    environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                    db.alterTable(tableFullName, new Table(tTable), environmentContext);
                    String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']';
                    LOG.debug(msg);
                    console.printInfo(msg);
                } else {
                    String msg = "Table " + tableFullName + " does not provide stats.";
                    LOG.debug(msg);
                }
            } catch (Exception e) {
                console.printInfo("[Warning] could not update stats for " + tableFullName + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
            }
        } else {
            // Partitioned table
            for (Partition partn : partitions) {
                threadPool.execute(new StatsCollection(partn));
            }
            LOG.debug("Stats collection waiting for threadpool to shutdown..");
            shutdownAndAwaitTermination(threadPool);
            LOG.debug("Stats collection threadpool shutdown successful.");
            ret = updatePartitions(db);
        }
    } catch (Exception e) {
        // Fail the query if the stats are supposed to be reliable
        if (work.isStatsReliable()) {
            ret = -1;
        }
    }
    // 0 indicates success; anything else indicates failure
    return ret;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) StatsProvidingRecordReader(org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
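
The key trick in this task is the zero-length dummy FileSplit: a stats-aware reader such as ORC's can report row counts and raw data size from file metadata without scanning any rows. A minimal sketch of that probe, assuming a hypothetical ORC table directory and hard-coding OrcInputFormat (the real code resolves the format from the table definition via reflection):

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class StatsProbeSketch {
    public static void main(String[] args) throws Exception {
        JobConf jc = new JobConf();
        // Hypothetical table location.
        Path dir = new Path("/warehouse/orc_table");
        FileSystem fs = dir.getFileSystem(jc);
        InputFormat<?, ?> inputFormat = new OrcInputFormat();
        long numRows = 0;
        long rawDataSize = 0;
        for (FileStatus file : fs.listStatus(dir)) {
            if (file.isDirectory() || file.getLen() == 0) {
                continue;
            }
            // Offset 0, length 0: the reader only needs the path to read its footer.
            FileSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { dir.toString() });
            RecordReader<?, ?> reader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
            if (reader instanceof StatsProvidingRecordReader) {
                StatsProvidingRecordReader statsRR = (StatsProvidingRecordReader) reader;
                numRows += statsRR.getStats().getRowCount();
                rawDataSize += statsRR.getStats().getRawDataSize();
            }
            reader.close();
        }
        System.out.println("rows=" + numRows + ", rawDataSize=" + rawDataSize);
    }
}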

Example 34 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

the class VectorizedOrcInputFormat method getRecordReader.

@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException {
    FileSplit fSplit = (FileSplit) inputSplit;
    reporter.setStatus(fSplit.toString());
    Path path = fSplit.getPath();
    OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
    if (fSplit instanceof OrcSplit) {
        OrcSplit orcSplit = (OrcSplit) fSplit;
        if (orcSplit.hasFooter()) {
            opts.orcTail(orcSplit.getOrcTail());
        }
        opts.maxLength(orcSplit.getFileLength());
    }
    Reader reader = OrcFile.createReader(path, opts);
    return new VectorizedOrcRecordReader(reader, conf, fSplit);
}
Also used : Path(org.apache.hadoop.fs.Path) RecordReader(org.apache.hadoop.mapred.RecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit)
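
A minimal sketch of the reader setup this method performs, assuming a plain FileSplit over a whole ORC file (in Hive the split would normally be an OrcSplit produced by OrcInputFormat.getSplits): build OrcFile.ReaderOptions from the configuration and the split, then open the file with OrcFile.createReader.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.mapred.FileSplit;

public class OrcReaderSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical ORC file path.
        Path orcFile = new Path("/warehouse/orc_table/000000_0");
        long len = orcFile.getFileSystem(conf).getFileStatus(orcFile).getLen();
        // A split covering the whole file, so start + length equals the file length.
        FileSplit split = new FileSplit(orcFile, 0, len, new String[0]);
        OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf)
            .maxLength(split.getStart() + split.getLength());
        Reader reader = OrcFile.createReader(split.getPath(), opts);
        System.out.println("rows in file: " + reader.getNumberOfRows());
    }
}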

Example 35 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

the class PTFRowContainer method first.

@Override
public Row first() throws HiveException {
    Row r = super.first();
    if (blockInfos.size() > 0) {
        InputSplit[] inputSplits = getInputSplits();
        FileSplit fS = null;
        BlockInfo bI = blockInfos.get(0);
        bI.startingSplit = 0;
        int i = 1;
        bI = i < blockInfos.size() ? blockInfos.get(i) : null;
        for (int j = 1; j < inputSplits.length && bI != null; j++) {
            fS = (FileSplit) inputSplits[j];
            while (bI != null && bI.startOffset < fS.getStart()) {
                bI.startingSplit = j - 1;
                i++;
                bI = i < blockInfos.size() ? blockInfos.get(i) : null;
            }
        }
        while (i < blockInfos.size()) {
            bI = blockInfos.get(i);
            bI.startingSplit = inputSplits.length - 1;
            i++;
        }
    }
    currentReadBlockStartRow = 0;
    return r;
}
Also used : FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
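
A self-contained sketch of the offset-to-split mapping performed above, with made-up offsets: for each block start offset, record the index of the last split whose start offset does not exceed it.

import java.util.Arrays;

public class BlockToSplitSketch {
    static int[] startingSplits(long[] splitStarts, long[] blockStarts) {
        int[] startingSplit = new int[blockStarts.length];
        int i = 0;
        for (int j = 1; j < splitStarts.length && i < blockStarts.length; j++) {
            // Blocks that begin before split j start inside split j - 1.
            while (i < blockStarts.length && blockStarts[i] < splitStarts[j]) {
                startingSplit[i++] = j - 1;
            }
        }
        // Any remaining blocks begin inside the last split.
        while (i < blockStarts.length) {
            startingSplit[i++] = splitStarts.length - 1;
        }
        return startingSplit;
    }

    public static void main(String[] args) {
        long[] splitStarts = { 0, 1000, 2000 };      // e.g. FileSplit.getStart() values
        long[] blockStarts = { 0, 500, 1500, 2500 }; // e.g. BlockInfo.startOffset values
        System.out.println(Arrays.toString(startingSplits(splitStarts, blockStarts)));
        // prints [0, 0, 1, 2]
    }
}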

Aggregations

FileSplit (org.apache.hadoop.mapred.FileSplit): 66
Path (org.apache.hadoop.fs.Path): 38
InputSplit (org.apache.hadoop.mapred.InputSplit): 23
JobConf (org.apache.hadoop.mapred.JobConf): 16
File (java.io.File): 10
IOException (java.io.IOException): 10
Configuration (org.apache.hadoop.conf.Configuration): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
Test (org.junit.Test): 9
RecordReader (org.apache.hadoop.mapred.RecordReader): 8
ArrayList (java.util.ArrayList): 7
Properties (java.util.Properties): 7
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 7
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5
NullWritable (org.apache.hadoop.io.NullWritable): 5
InputFormat (org.apache.hadoop.mapred.InputFormat): 4
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 4
ClusterTopology (org.apache.hyracks.api.topology.ClusterTopology): 4
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint): 4