Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class SplitGrouper, method schemaEvolved.
private boolean schemaEvolved(InputSplit s, InputSplit prevSplit, boolean groupAcrossFiles,
    MapWork work) throws IOException {
  boolean retval = false;
  Path path = ((FileSplit) s).getPath();
  PartitionDesc pd = HiveFileFormatUtils.getFromPathRecursively(
      work.getPathToPartitionInfo(), path, cache);
  String currentDeserializerClass = pd.getDeserializerClassName();
  Class<?> currentInputFormatClass = pd.getInputFileFormatClass();
  Class<?> previousInputFormatClass = null;
  String previousDeserializerClass = null;
  if (prevSplit != null) {
    Path prevPath = ((FileSplit) prevSplit).getPath();
    if (!groupAcrossFiles) {
      return !path.equals(prevPath);
    }
    PartitionDesc prevPD = HiveFileFormatUtils.getFromPathRecursively(
        work.getPathToPartitionInfo(), prevPath, cache);
    previousDeserializerClass = prevPD.getDeserializerClassName();
    previousInputFormatClass = prevPD.getInputFileFormatClass();
  }
  if ((currentInputFormatClass != previousInputFormatClass)
      || (!currentDeserializerClass.equals(previousDeserializerClass))) {
    retval = true;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Adding split " + path + " to src new group? " + retval);
  }
  return retval;
}
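A minimal caller-side sketch, not taken from the Hive source: the splits and work variables below are hypothetical placeholders, illustrating how consecutive FileSplits could be compared with schemaEvolved when deciding whether to start a new split group.

InputSplit prevSplit = null;
for (InputSplit s : splits) {
  // start a new group whenever the input format or deserializer changes between files
  if (schemaEvolved(s, prevSplit, true, work)) {
    // begin a new split group for s here
  }
  prevSplit = s;
}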
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class SerDeEncodedDataReader, method readFileWithCache.
public Boolean readFileWithCache(long startTime) throws IOException, InterruptedException {
  if (fileKey == null) return false;
  BooleanRef gotAllData = new BooleanRef();
  long endOfSplit = split.getStart() + split.getLength();
  this.cachedData = cache.getFileData(fileKey, split.getStart(), endOfSplit,
      writerIncludes, CC_FACTORY, counters, gotAllData);
  if (cachedData == null) {
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
      LlapIoImpl.CACHE_LOGGER.trace("No data for the split found in cache");
    }
    return false;
  }
  String[] hosts = extractHosts(split, false), inMemoryHosts = extractHosts(split, true);
  List<StripeData> slices = cachedData.getData();
  if (slices.isEmpty()) return false;
  long uncachedPrefixEnd = slices.get(0).getKnownTornStart(),
      uncachedSuffixStart = slices.get(slices.size() - 1).getLastEnd();
  Ref<Integer> stripeIx = Ref.from(0);
  if (uncachedPrefixEnd > split.getStart()) {
    // TODO: can we merge neighboring splits? So we don't init so many readers.
    FileSplit sliceSplit = new FileSplit(split.getPath(), split.getStart(),
        uncachedPrefixEnd - split.getStart(), hosts, inMemoryHosts);
    if (!processOneFileSplit(sliceSplit, startTime, stripeIx, null)) return null;
  }
  while (!slices.isEmpty()) {
    // Consume the slice being processed so the loop advances.
    StripeData slice = slices.remove(0);
    long start = slice.getKnownTornStart();
    // Will also read the last row.
    long len = slice.getLastStart() - start;
    FileSplit sliceSplit = new FileSplit(split.getPath(), start, len, hosts, inMemoryHosts);
    if (!processOneFileSplit(sliceSplit, startTime, stripeIx, slice)) return null;
  }
  boolean isUnfortunate = false;
  if (uncachedSuffixStart == endOfSplit) {
    // This is rather obscure. The end of last row cached is precisely at the split end offset.
    // If the split is in the middle of the file, LRR would read one more row after that,
    // therefore as unfortunate as it is, we have to do a one-row read. However, for that to
    // have happened, someone should have supplied a split that ends inside the last row, i.e.
    // a few bytes earlier than the current split, which is pretty unlikely. What is more likely
    // is that the split, and the last row, both end at the end of file. Check for this.
    long size = split.getPath().getFileSystem(daemonConf).getFileStatus(split.getPath()).getLen();
    isUnfortunate = size > endOfSplit;
    if (isUnfortunate) {
      // Log at warn, given how unfortunate this is.
      LlapIoImpl.LOG.warn("One-row mismatch at the end of split " + split.getPath()
          + " at " + endOfSplit + "; file size is " + size);
    }
  }
  if (uncachedSuffixStart < endOfSplit || isUnfortunate) {
    // Note: we assume 0-length split is correct given how LRR interprets offsets (reading an
    // extra row). Should we instead assume 1+ chars and add 1 for isUnfortunate?
    FileSplit splitPart = new FileSplit(split.getPath(), uncachedSuffixStart,
        endOfSplit - uncachedSuffixStart, hosts, inMemoryHosts);
    if (!processOneFileSplit(splitPart, startTime, stripeIx, null)) return null;
  }
  return true;
}
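For reference, the five-argument FileSplit constructor used for the slice splits above describes a byte range of a file together with preferred hosts and hosts that hold the data in memory. A minimal sketch with placeholder values:

Path p = new Path("/tmp/example.txt");                        // hypothetical file
String[] hosts = new String[] { "node1", "node2" };           // hypothetical preferred hosts
String[] inMemoryHosts = new String[] { "node1" };            // hypothetical hosts caching the data
FileSplit slice = new FileSplit(p, 0L, 4096L, hosts, inMemoryHosts); // bytes [0, 4096) of the file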
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class VectorizedColumnReaderTestBase, method getFileSplit.
protected static FileSplit getFileSplit(Job vectorJob) throws IOException, InterruptedException {
  ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
  InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
  FileSplit fsplit = new FileSplit(file, 0L, split.getLength(), split.getLocations());
  return fsplit;
}
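A hedged usage sketch; the Job setup below is an assumption rather than code copied from the test, and it reuses the same file path field that getFileSplit references:

Job vectorJob = Job.getInstance(new Configuration(), "read Parquet");
ParquetInputFormat.setInputPaths(vectorJob, file);
FileSplit split = getFileSplit(vectorJob);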
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class AbstractTestParquetDirect, method read.
public static List<ArrayWritable> read(Path parquetFile) throws IOException {
  List<ArrayWritable> records = new ArrayList<ArrayWritable>();
  RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
      new FileSplit(parquetFile, 0, fileLength(parquetFile), (String[]) null), new JobConf(), null);
  NullWritable alwaysNull = reader.createKey();
  ArrayWritable record = reader.createValue();
  while (reader.next(alwaysNull, record)) {
    records.add(record);
    // a new value so the last isn't clobbered
    record = reader.createValue();
  }
  return records;
}
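A hypothetical usage in a test; the path and expected record count are placeholders, not taken from the original class:

List<ArrayWritable> records = read(new Path("/tmp/direct-test.parquet")); // hypothetical file
assertEquals("unexpected record count", 1, records.size());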
Use of org.apache.hadoop.mapred.FileSplit in project incubator-systemml by apache.
The class RemoteParForColocatedNLineInputFormat, method getSplits.
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  InputSplit[] tmp = super.getSplits(job, numSplits);
  // get partitioning information
  MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
  PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
  PartitionFormat pf = new PartitionFormat(dpf, -1);
  int blen = (int) (pf.isRowwise() ? pf.getNumRows(mc) : pf.getNumColumns(mc));
  String fname = MRJobConfiguration.getPartitioningFilename(job);
  // create wrapper splits
  InputSplit[] ret = new InputSplit[tmp.length];
  for (int i = 0; i < tmp.length; i++) {
    // check for robustness of subsequent cast
    if (tmp[i] instanceof FileSplit)
      ret[i] = new RemoteParForColocatedFileSplit((FileSplit) tmp[i], fname, blen);
    else
      ret[i] = tmp[i];
  }
  return ret;
}
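The wrapper above only needs the standard org.apache.hadoop.mapred.FileSplit accessors; a minimal sketch, reusing the tmp array from getSplits, of the information a colocated split can build on:

if (tmp[0] instanceof FileSplit) {                  // same guard as in getSplits above
  FileSplit fs = (FileSplit) tmp[0];
  Path file = fs.getPath();                         // backing file
  long offset = fs.getStart();                      // byte offset where the split begins
  long length = fs.getLength();                     // split length in bytes
  String[] locations = fs.getLocations();           // preferred hosts; may throw IOException
}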