
Example 26 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

The class SplitGrouper, method schemaEvolved.

private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean groupAcrossFiles, MapWork work) throws IOException {
    boolean retval = false;
    Path path = ((FileSplit) s).getPath();
    PartitionDesc pd = HiveFileFormatUtils.getFromPathRecursively(work.getPathToPartitionInfo(), path, cache);
    String currentDeserializerClass = pd.getDeserializerClassName();
    Class<?> currentInputFormatClass = pd.getInputFileFormatClass();
    Class<?> previousInputFormatClass = null;
    String previousDeserializerClass = null;
    if (prevSplit != null) {
        Path prevPath = ((FileSplit) prevSplit).getPath();
        if (!groupAcrossFiles) {
            return !path.equals(prevPath);
        }
        PartitionDesc prevPD = HiveFileFormatUtils.getFromPathRecursively(work.getPathToPartitionInfo(), prevPath, cache);
        previousDeserializerClass = prevPD.getDeserializerClassName();
        previousInputFormatClass = prevPD.getInputFileFormatClass();
    }
    if ((currentInputFormatClass != previousInputFormatClass) || (!currentDeserializerClass.equals(previousDeserializerClass))) {
        retval = true;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Adding split " + path + " to src new group? " + retval);
    }
    return retval;
}
Also used: Path (org.apache.hadoop.fs.Path), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), FileSplit (org.apache.hadoop.mapred.FileSplit)
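
Not part of the Hive sources above, but a minimal, self-contained sketch of the FileSplit accessors the method relies on: casting an old-API InputSplit to FileSplit to get its Path, and comparing paths of consecutive splits the way the groupAcrossFiles short-circuit does. The class and method names (FileSplitPathCheck, sameFile) are hypothetical.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

public class FileSplitPathCheck {

    // Mirrors the groupAcrossFiles short-circuit above: when grouping across
    // files is disabled, two splits can be grouped only if they come from the
    // same file.
    static boolean sameFile(InputSplit current, InputSplit previous) {
        Path currentPath = ((FileSplit) current).getPath();
        Path previousPath = ((FileSplit) previous).getPath();
        return currentPath.equals(previousPath);
    }

    public static void main(String[] args) {
        FileSplit a = new FileSplit(new Path("/warehouse/t1/part-00000"), 0L, 128L, new String[0]);
        FileSplit b = new FileSplit(new Path("/warehouse/t1/part-00001"), 0L, 128L, new String[0]);
        System.out.println(sameFile(a, a)); // true
        System.out.println(sameFile(a, b)); // false
    }
}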

Example 27 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

The class SerDeEncodedDataReader, method readFileWithCache.

public Boolean readFileWithCache(long startTime) throws IOException, InterruptedException {
    if (fileKey == null)
        return false;
    BooleanRef gotAllData = new BooleanRef();
    long endOfSplit = split.getStart() + split.getLength();
    this.cachedData = cache.getFileData(fileKey, split.getStart(), endOfSplit, writerIncludes, CC_FACTORY, counters, gotAllData);
    if (cachedData == null) {
        if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
            LlapIoImpl.CACHE_LOGGER.trace("No data for the split found in cache");
        }
        return false;
    }
    String[] hosts = extractHosts(split, false), inMemoryHosts = extractHosts(split, true);
    List<StripeData> slices = cachedData.getData();
    if (slices.isEmpty())
        return false;
    long uncachedPrefixEnd = slices.get(0).getKnownTornStart(), uncachedSuffixStart = slices.get(slices.size() - 1).getLastEnd();
    Ref<Integer> stripeIx = Ref.from(0);
    if (uncachedPrefixEnd > split.getStart()) {
        // TODO: can we merge neighboring splits? So we don't init so many readers.
        FileSplit sliceSplit = new FileSplit(split.getPath(), split.getStart(), uncachedPrefixEnd - split.getStart(), hosts, inMemoryHosts);
        if (!processOneFileSplit(sliceSplit, startTime, stripeIx, null))
            return null;
    }
    while (!slices.isEmpty()) {
        StripeData slice = slices.get(0);
        long start = slice.getKnownTornStart();
        // Will also read the last row.
        long len = slice.getLastStart() - start;
        FileSplit sliceSplit = new FileSplit(split.getPath(), start, len, hosts, inMemoryHosts);
        if (!processOneFileSplit(sliceSplit, startTime, stripeIx, slice))
            return null;
    }
    boolean isUnfortunate = false;
    if (uncachedSuffixStart == endOfSplit) {
        // This is rather obscure. The end of last row cached is precisely at the split end offset.
        // If the split is in the middle of the file, LRR would read one more row after that,
        // therefore as unfortunate as it is, we have to do a one-row read. However, for that to
        // have happened, someone should have supplied a split that ends inside the last row, i.e.
        // a few bytes earlier than the current split, which is pretty unlikely. What is more likely
        // is that the split, and the last row, both end at the end of file. Check for this.
        long size = split.getPath().getFileSystem(daemonConf).getFileStatus(split.getPath()).getLen();
        isUnfortunate = size > endOfSplit;
        if (isUnfortunate) {
            // Log at warn, given how unfortunate this is.
            LlapIoImpl.LOG.warn("One-row mismatch at the end of split " + split.getPath() + " at " + endOfSplit + "; file size is " + size);
        }
    }
    if (uncachedSuffixStart < endOfSplit || isUnfortunate) {
        // Note: we assume 0-length split is correct given now LRR interprets offsets (reading an
        // extra row). Should we instead assume 1+ chars and add 1 for isUnfortunate?
        FileSplit splitPart = new FileSplit(split.getPath(), uncachedSuffixStart, endOfSplit - uncachedSuffixStart, hosts, inMemoryHosts);
        if (!processOneFileSplit(splitPart, startTime, stripeIx, null))
            return null;
    }
    return true;
}
Also used: StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData), BooleanRef (org.apache.hadoop.hive.common.io.DataCache.BooleanRef), FileSplit (org.apache.hadoop.mapred.FileSplit)
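
The method above repeatedly carves sub-ranges of the original split (uncached prefix, cached slices, uncached suffix) into new FileSplit instances. The following standalone sketch shows that slicing pattern in isolation, assuming the cut offsets are sorted and lie inside the parent split; SplitSlicer and slice are illustrative names, not Hive APIs.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class SplitSlicer {

    // Cuts a parent split into consecutive FileSplits at the given byte offsets,
    // similar in spirit to the prefix/slice/suffix splits built above.
    static List<FileSplit> slice(FileSplit parent, long[] boundaries) {
        List<FileSplit> result = new ArrayList<>();
        long end = parent.getStart() + parent.getLength();
        long prev = parent.getStart();
        for (long b : boundaries) {
            if (b > prev && b < end) {
                result.add(new FileSplit(parent.getPath(), prev, b - prev, new String[0]));
                prev = b;
            }
        }
        if (prev < end) {
            result.add(new FileSplit(parent.getPath(), prev, end - prev, new String[0]));
        }
        return result;
    }

    public static void main(String[] args) {
        FileSplit parent = new FileSplit(new Path("/data/file.txt"), 0L, 1000L, new String[0]);
        for (FileSplit s : slice(parent, new long[] { 200L, 700L })) {
            System.out.println(s.getStart() + " + " + s.getLength());
        }
    }
}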

Example 28 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

The class VectorizedColumnReaderTestBase, method getFileSplit.

protected static FileSplit getFileSplit(Job vectorJob) throws IOException, InterruptedException {
    ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
    InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
    FileSplit fsplit = new FileSplit(file, 0L, split.getLength(), split.getLocations());
    return fsplit;
}
Also used: ParquetInputFormat (org.apache.parquet.hadoop.ParquetInputFormat), FileSplit (org.apache.hadoop.mapred.FileSplit), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
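
getFileSplit bridges the new (mapreduce) and old (mapred) split APIs by copying length and locations from the Parquet-provided split into a mapred FileSplit. A generic version of that adapter might look like the sketch below; note that in the test base class the file path comes from a static field, whereas here it is passed in, and SplitAdapter/toMapredSplit are hypothetical names.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;

public class SplitAdapter {

    // Rewraps a new-API (mapreduce) split as an old-API (mapred) FileSplit,
    // as getFileSplit does above: the file path is supplied by the caller,
    // while length and locations are taken from the source split.
    static FileSplit toMapredSplit(Path file, InputSplit split)
            throws IOException, InterruptedException {
        return new FileSplit(file, 0L, split.getLength(), split.getLocations());
    }
}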

Example 29 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.

The class AbstractTestParquetDirect, method read.

public static List<ArrayWritable> read(Path parquetFile) throws IOException {
    List<ArrayWritable> records = new ArrayList<ArrayWritable>();
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(new FileSplit(parquetFile, 0, fileLength(parquetFile), (String[]) null), new JobConf(), null);
    NullWritable alwaysNull = reader.createKey();
    ArrayWritable record = reader.createValue();
    while (reader.next(alwaysNull, record)) {
        records.add(record);
        // a new value so the last isn't clobbered
        record = reader.createValue();
    }
    return records;
}
Also used: ArrayWritable (org.apache.hadoop.io.ArrayWritable), ArrayList (java.util.ArrayList), FileSplit (org.apache.hadoop.mapred.FileSplit), NullWritable (org.apache.hadoop.io.NullWritable), JobConf (org.apache.hadoop.mapred.JobConf)
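
The read loop above follows the standard old-API RecordReader pattern: create a key and a value once, then allocate a fresh value per record so previously collected results are not overwritten. A generic sketch of that pattern, with RecordReaderUtil and readAll as hypothetical helper names:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.mapred.RecordReader;

public class RecordReaderUtil {

    // Drains an old-API RecordReader into a list of values, creating a new
    // value object per record, exactly like the loop in read() above.
    static <K, V> List<V> readAll(RecordReader<K, V> reader) throws IOException {
        List<V> values = new ArrayList<>();
        K key = reader.createKey();
        V value = reader.createValue();
        while (reader.next(key, value)) {
            values.add(value);
            value = reader.createValue(); // a new value so the last isn't clobbered
        }
        reader.close();
        return values;
    }
}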

Example 30 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project incubator-systemml by apache.

The class RemoteParForColocatedNLineInputFormat, method getSplits.

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    InputSplit[] tmp = super.getSplits(job, numSplits);
    // get partitioning information
    MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
    PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
    PartitionFormat pf = new PartitionFormat(dpf, -1);
    int blen = (int) (pf.isRowwise() ? pf.getNumRows(mc) : pf.getNumColumns(mc));
    String fname = MRJobConfiguration.getPartitioningFilename(job);
    // create wrapper splits
    InputSplit[] ret = new InputSplit[tmp.length];
    for (int i = 0; i < tmp.length; i++) {
        // check for robustness of subsequent cast
        if (tmp[i] instanceof FileSplit)
            ret[i] = new RemoteParForColocatedFileSplit((FileSplit) tmp[i], fname, blen);
        else
            ret[i] = tmp[i];
    }
    return ret;
}
Also used: PDataPartitionFormat (org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat), PartitionFormat (org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PartitionFormat), FileSplit (org.apache.hadoop.mapred.FileSplit), InputSplit (org.apache.hadoop.mapred.InputSplit), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
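
getSplits only wraps splits that really are FileSplits and passes everything else through unchanged. Below is a self-contained sketch of that wrap-if-instanceof pattern; TaggedFileSplit is a hypothetical stand-in for RemoteParForColocatedFileSplit, not a SystemML class.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

public class SplitWrapping {

    // Illustrative FileSplit subclass carrying one extra piece of metadata,
    // standing in for RemoteParForColocatedFileSplit above.
    static class TaggedFileSplit extends FileSplit {
        private final String tag;

        TaggedFileSplit(FileSplit fs, String tag) {
            super(fs.getPath(), fs.getStart(), fs.getLength(), new String[0]);
            this.tag = tag;
        }

        String getTag() {
            return tag;
        }
    }

    // Mirrors getSplits above: wrap only splits that are FileSplits,
    // pass everything else through untouched.
    static InputSplit[] wrapAll(InputSplit[] splits, String tag) {
        InputSplit[] wrapped = new InputSplit[splits.length];
        for (int i = 0; i < splits.length; i++) {
            wrapped[i] = (splits[i] instanceof FileSplit)
                    ? new TaggedFileSplit((FileSplit) splits[i], tag)
                    : splits[i];
        }
        return wrapped;
    }

    public static void main(String[] args) {
        InputSplit[] in = { new FileSplit(new Path("/data/x"), 0L, 10L, new String[0]) };
        InputSplit[] out = wrapAll(in, "partition-info");
        System.out.println(out[0].getClass().getSimpleName()); // TaggedFileSplit
    }
}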

Aggregations

FileSplit (org.apache.hadoop.mapred.FileSplit): 66
Path (org.apache.hadoop.fs.Path): 38
InputSplit (org.apache.hadoop.mapred.InputSplit): 23
JobConf (org.apache.hadoop.mapred.JobConf): 16
File (java.io.File): 10
IOException (java.io.IOException): 10
Configuration (org.apache.hadoop.conf.Configuration): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
Test (org.junit.Test): 9
RecordReader (org.apache.hadoop.mapred.RecordReader): 8
ArrayList (java.util.ArrayList): 7
Properties (java.util.Properties): 7
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 7
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5
NullWritable (org.apache.hadoop.io.NullWritable): 5
InputFormat (org.apache.hadoop.mapred.InputFormat): 4
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 4
ClusterTopology (org.apache.hyracks.api.topology.ClusterTopology): 4
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint): 4