Use of org.apache.hadoop.mapred.FileSplit in project incubator-systemml by apache.
The class DelegatingInputFormat, method getRecordReader.
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(taggedInputSplit.getInputFormatClass(), conf);
  InputSplit inputSplit = taggedInputSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) inputSplit;
    conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
  }
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
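For reference, a minimal standalone sketch (not taken from the SystemML source; the path and host names are hypothetical) of the FileSplit constructor and the same accessors the snippet above copies into the MR_MAP_INPUT_FILE / _START / _LENGTH properties.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class FileSplitFieldsDemo {
  public static void main(String[] args) {
    FileSplit split = new FileSplit(
        new Path("hdfs://nn/example/data/part-00000"),  // hypothetical file
        0L,                                             // start offset in bytes
        128L * 1024 * 1024,                             // split length in bytes
        new String[] { "host1", "host2" });             // preferred locations

    // The same accessors used above when forwarding split metadata to the wrapped reader.
    System.out.println(split.getPath());
    System.out.println(split.getStart());
    System.out.println(split.getLength());
  }
}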
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class CustomPartitionVertex, method getBucketSplitMapForPath.
/*
 * This method generates the map of bucket to file splits.
 */
private Multimap<Integer, InputSplit> getBucketSplitMapForPath(Map<String, Set<FileSplit>> pathFileSplitsMap) {
  int bucketNum = 0;
  Multimap<Integer, InputSplit> bucketToInitialSplitMap = ArrayListMultimap.<Integer, InputSplit>create();
  for (Map.Entry<String, Set<FileSplit>> entry : pathFileSplitsMap.entrySet()) {
    int bucketId = bucketNum % numBuckets;
    for (FileSplit fsplit : entry.getValue()) {
      bucketToInitialSplitMap.put(bucketId, fsplit);
    }
    bucketNum++;
  }
  // If there are fewer input paths than buckets, back-fill the remaining buckets
  // by reusing the splits already assigned to the earlier buckets.
  if (bucketNum < numBuckets) {
    int loopedBucketId = 0;
    for (; bucketNum < numBuckets; bucketNum++) {
      for (InputSplit fsplit : bucketToInitialSplitMap.get(loopedBucketId)) {
        bucketToInitialSplitMap.put(bucketNum, fsplit);
      }
      loopedBucketId++;
    }
  }
  return bucketToInitialSplitMap;
}
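As a worked illustration of the round-robin assignment and the back-fill loop above, here is a self-contained sketch (not Hive code; the paths and split names are made up, and plain strings stand in for FileSplit objects) showing how two input paths map onto four buckets.

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class BucketRoundRobinDemo {
  public static void main(String[] args) {
    int numBuckets = 4;

    // Two input paths (made up), each with its own splits (plain strings here).
    Map<String, List<String>> pathSplits = new LinkedHashMap<>();
    pathSplits.put("/warehouse/t/000000_0", Arrays.asList("splitA1", "splitA2"));
    pathSplits.put("/warehouse/t/000001_0", Arrays.asList("splitB1"));

    Multimap<Integer, String> bucketToSplits = ArrayListMultimap.create();

    // Round-robin: path k goes to bucket k % numBuckets.
    int bucketNum = 0;
    for (Map.Entry<String, List<String>> entry : pathSplits.entrySet()) {
      int bucketId = bucketNum % numBuckets;
      for (String split : entry.getValue()) {
        bucketToSplits.put(bucketId, split);
      }
      bucketNum++;
    }

    // Back-fill: with only 2 paths, buckets 2 and 3 would otherwise be empty,
    // so reuse the splits already assigned to buckets 0 and 1.
    int loopedBucketId = 0;
    for (; bucketNum < numBuckets; bucketNum++) {
      for (String split : bucketToSplits.get(loopedBucketId)) {
        bucketToSplits.put(bucketNum, split);
      }
      loopedBucketId++;
    }

    // Prints something like {0=[splitA1, splitA2], 1=[splitB1], 2=[splitA1, splitA2], 3=[splitB1]}
    System.out.println(bucketToSplits);
  }
}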
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class StatsNoJobTask, method aggregateStats.
private int aggregateStats(ExecutorService threadPool, Hive db) {
  int ret = 0;
  try {
    Collection<Partition> partitions = null;
    if (work.getPrunedPartitionList() == null) {
      partitions = getPartitionsList();
    } else {
      partitions = work.getPrunedPartitionList().getPartitions();
    }
    // non-partitioned table
    if (partitions == null) {
      org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
      Map<String, String> parameters = tTable.getParameters();
      try {
        Path dir = new Path(tTable.getSd().getLocation());
        long numRows = 0;
        long rawDataSize = 0;
        long fileSize = 0;
        long numFiles = 0;
        FileSystem fs = dir.getFileSystem(conf);
        FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
        boolean statsAvailable = false;
        for (FileStatus file : fileList) {
          if (!file.isDir()) {
            InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(table.getInputFormatClass(), jc);
            InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table.getDataLocation().toString() });
            if (file.getLen() == 0) {
              numFiles += 1;
              statsAvailable = true;
            } else {
              org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
              StatsProvidingRecordReader statsRR;
              if (recordReader instanceof StatsProvidingRecordReader) {
                statsRR = (StatsProvidingRecordReader) recordReader;
                numRows += statsRR.getStats().getRowCount();
                rawDataSize += statsRR.getStats().getRawDataSize();
                fileSize += file.getLen();
                numFiles += 1;
                statsAvailable = true;
              }
              recordReader.close();
            }
          }
        }
        if (statsAvailable) {
          parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
          parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
          parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
          parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
          EnvironmentContext environmentContext = new EnvironmentContext();
          environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
          db.alterTable(tableFullName, new Table(tTable), environmentContext);
          String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']';
          LOG.debug(msg);
          console.printInfo(msg);
        } else {
          String msg = "Table " + tableFullName + " does not provide stats.";
          LOG.debug(msg);
        }
      } catch (Exception e) {
        console.printInfo("[Warning] could not update stats for " + tableFullName + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
      }
    } else {
      // Partitioned table
      for (Partition partn : partitions) {
        threadPool.execute(new StatsCollection(partn));
      }
      LOG.debug("Stats collection waiting for threadpool to shutdown..");
      shutdownAndAwaitTermination(threadPool);
      LOG.debug("Stats collection threadpool shutdown successful.");
      ret = updatePartitions(db);
    }
  } catch (Exception e) {
    // Fail the query if the stats are supposed to be reliable
    if (work.isStatsReliable()) {
      ret = -1;
    }
  }
  // A return value of 0 indicates success; anything else indicates failure.
  return ret;
}
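The core FileSplit usage in the method above is the zero-length "dummy" split handed to the input format so that footer-aware readers (such as ORC) can report statistics without scanning the data. Below is a condensed sketch of just that step, under the assumption that the caller already has a configured JobConf and a suitable InputFormat; the class and method names here are illustrative, not part of Hive.

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class FooterStatsProbe {

  /** Returns the row count from the file's footer, or -1 if the reader provides no stats. */
  public static long probeRowCount(InputFormat<?, ?> inputFormat, FileStatus file, JobConf jc)
      throws IOException {
    // Start and length are both 0: the reader only needs the path to read the footer.
    FileSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[0]);
    RecordReader<?, ?> reader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
    try {
      if (reader instanceof StatsProvidingRecordReader) {
        return ((StatsProvidingRecordReader) reader).getStats().getRowCount();
      }
      return -1L;
    } finally {
      reader.close();
    }
  }
}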
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class VectorizedOrcInputFormat, method getRecordReader.
@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException {
  FileSplit fSplit = (FileSplit) inputSplit;
  reporter.setStatus(fSplit.toString());
  Path path = fSplit.getPath();
  OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
  if (fSplit instanceof OrcSplit) {
    OrcSplit orcSplit = (OrcSplit) fSplit;
    if (orcSplit.hasFooter()) {
      opts.orcTail(orcSplit.getOrcTail());
    }
    opts.maxLength(orcSplit.getFileLength());
  }
  Reader reader = OrcFile.createReader(path, opts);
  return new VectorizedOrcRecordReader(reader, conf, fSplit);
}
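A short companion sketch (illustrative, not part of Hive) showing the standard mapred read loop for the RecordReader<NullWritable, VectorizedRowBatch> that this method returns. It assumes the caller already has the input format, a FileSplit over an existing ORC file, and a JobConf.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class VectorizedReadLoop {

  /** Counts rows by iterating the vectorized reader; each next() call fills a whole batch. */
  public static long countRows(InputFormat<NullWritable, VectorizedRowBatch> inputFormat,
      FileSplit split, JobConf conf) throws IOException {
    RecordReader<NullWritable, VectorizedRowBatch> reader =
        inputFormat.getRecordReader(split, conf, Reporter.NULL);
    try {
      NullWritable key = reader.createKey();
      VectorizedRowBatch batch = reader.createValue();
      long rows = 0;
      while (reader.next(key, batch)) {
        rows += batch.size;  // batch.size is the number of valid rows in this batch
      }
      return rows;
    } finally {
      reader.close();
    }
  }
}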
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class PTFRowContainer, method first.
@Override
public Row first() throws HiveException {
  Row r = super.first();
  if (blockInfos.size() > 0) {
    InputSplit[] inputSplits = getInputSplits();
    FileSplit fS = null;
    BlockInfo bI = blockInfos.get(0);
    bI.startingSplit = 0;
    int i = 1;
    bI = i < blockInfos.size() ? blockInfos.get(i) : null;
    for (int j = 1; j < inputSplits.length && bI != null; j++) {
      fS = (FileSplit) inputSplits[j];
      while (bI != null && bI.startOffset < fS.getStart()) {
        bI.startingSplit = j - 1;
        i++;
        bI = i < blockInfos.size() ? blockInfos.get(i) : null;
      }
    }
    while (i < blockInfos.size()) {
      bI = blockInfos.get(i);
      bI.startingSplit = inputSplits.length - 1;
      i++;
    }
  }
  currentReadBlockStartRow = 0;
  return r;
}
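The split-assignment logic above can be hard to follow inline. Here is a simplified standalone sketch (not the Hive code) of the same mapping, using plain offsets in place of BlockInfo and FileSplit: each block is assigned the index of the last split whose start offset is at or before the block's start offset.

import java.util.Arrays;

public class BlockToSplitMapping {

  /** splitStarts and blockStarts are both ascending; returns the starting split index per block. */
  static int[] assignStartingSplits(long[] splitStarts, long[] blockStarts) {
    int[] startingSplit = new int[blockStarts.length];
    int i = 0;
    for (int j = 1; j < splitStarts.length && i < blockStarts.length; j++) {
      // Every block that begins before split j must have started in split j - 1 or earlier;
      // since blocks are processed in order, they all belong to split j - 1 here.
      while (i < blockStarts.length && blockStarts[i] < splitStarts[j]) {
        startingSplit[i++] = j - 1;
      }
    }
    // Any remaining blocks start inside the last split.
    while (i < blockStarts.length) {
      startingSplit[i++] = splitStarts.length - 1;
    }
    return startingSplit;
  }

  public static void main(String[] args) {
    long[] splitStarts = { 0L, 1000L, 2000L };
    long[] blockStarts = { 0L, 600L, 1500L, 2500L };
    // Prints [0, 0, 1, 2]: each block maps to the split whose range contains its start offset.
    System.out.println(Arrays.toString(assignStartingSplits(splitStarts, blockStarts)));
  }
}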