Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project druid by druid-io.
The class DruidParquetInputTest, method getFirstRecord.
private GenericRecord getFirstRecord(Job job, String parquetPath) throws IOException, InterruptedException {
  File testFile = new File(parquetPath);
  Path path = new Path(testFile.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile.length(), null);
  DruidParquetInputFormat inputFormat = ReflectionUtils.newInstance(DruidParquetInputFormat.class, job.getConfiguration());
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  reader.initialize(split, context);
  reader.nextKeyValue();
  GenericRecord data = (GenericRecord) reader.getCurrentValue();
  reader.close();
  return data;
}
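For context, a minimal sketch of how such a helper might be called from a test method in the same class; the parquet path and the field name are hypothetical and depend on the test resources bundled with the project (assumes the usual JUnit, Hadoop, and Avro imports).
  @Test
  public void testReadFirstRecord() throws Exception {
    // Hypothetical test: the file path and field name are illustrative only.
    Job job = Job.getInstance(new Configuration());
    GenericRecord record = getFirstRecord(job, "example/test_data.snappy.parquet");
    Assert.assertNotNull(record.get("timestamp"));
  }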
Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project pinot by linkedin.
The class DelegatingAvroKeyInputFormat, method createRecordReader.
public org.apache.hadoop.mapreduce.RecordReader<org.apache.avro.mapred.AvroKey<T>, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  LOGGER.info("DelegatingAvroKeyInputFormat.createRecordReader() for split:{}", split);
  FileSplit fileSplit = (FileSplit) split;
  Configuration configuration = context.getConfiguration();
  String sourceName = getSourceNameFromPath(fileSplit, configuration);
  LOGGER.info("Source Name for path {} : {}", fileSplit.getPath(), sourceName);
  Map<String, String> schemaJSONMapping = new ObjectMapper().readValue(configuration.get("schema.json.mapping"), MAP_STRING_STRING_TYPE);
  LOGGER.info("Schema JSON Mapping: {}", schemaJSONMapping);
  String sourceSchemaJSON = schemaJSONMapping.get(sourceName);
  Schema schema = new Schema.Parser().parse(sourceSchemaJSON);
  return new AvroKeyRecordReader<T>(schema);
}
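The reader resolves its Avro schema by source name from a JSON map stored in the job configuration under the key schema.json.mapping; getSourceNameFromPath (not shown here) derives that name from the split's path. A rough sketch of how a driver might populate the property before job submission, assuming a Job instance named job and two parsed Avro Schema objects; the source names are illustrative, and the surrounding method is assumed to handle Jackson's checked exception:
  // Hypothetical driver-side setup; "clicks"/"impressions" and their schemas are illustrative.
  Map<String, String> schemaJsonMapping = new HashMap<String, String>();
  schemaJsonMapping.put("clicks", clicksSchema.toString());           // Avro Schema rendered as JSON
  schemaJsonMapping.put("impressions", impressionsSchema.toString());
  job.getConfiguration().set("schema.json.mapping",
      new ObjectMapper().writeValueAsString(schemaJsonMapping));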
Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.
The class TeraScheduler, method getNewFileSplits.
/**
 * Solve the schedule and modify the FileSplit array to reflect the new
 * schedule. Placed splits are moved to the front and unplaceable splits
 * to the end.
 * @return a new list of FileSplits, modified so that each placed split
 * lists its best host as its only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
  solve();
  FileSplit[] result = new FileSplit[realSplits.length];
  int left = 0;
  int right = realSplits.length - 1;
  for (int i = 0; i < splits.length; ++i) {
    if (splits[i].isAssigned) {
      // copy the split and fix up the locations
      String[] newLocations = { splits[i].locations.get(0).hostname };
      realSplits[i] = new FileSplit(realSplits[i].getPath(), realSplits[i].getStart(), realSplits[i].getLength(), newLocations);
      result[left++] = realSplits[i];
    } else {
      result[right--] = realSplits[i];
    }
  }
  List<InputSplit> ret = new ArrayList<InputSplit>();
  for (FileSplit fs : result) {
    ret.add(fs);
  }
  return ret;
}
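In Hadoop, TeraInputFormat.getSplits can feed its initial splits through this scheduler and hand the relocated list back to the framework. A minimal sketch of inspecting the result, assuming a TeraScheduler instance named scheduler and a surrounding method that declares IOException; the printout is illustrative:
  List<InputSplit> scheduled = scheduler.getNewFileSplits();
  for (InputSplit s : scheduled) {
    FileSplit fs = (FileSplit) s;
    // Placed splits now report exactly one location: the chosen best host.
    System.out.println(fs.getPath() + " -> " + Arrays.toString(fs.getLocations()));
  }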
Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.
The class UniformSizeInputFormat, method getSplits.
private List<InputSplit> getSplits(Configuration configuration, int numSplits, long totalSizeBytes) throws IOException {
  List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
  long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
  CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
  Text srcRelPath = new Text();
  long currentSplitSize = 0;
  long lastSplitStart = 0;
  long lastPosition = 0;
  final Path listingFilePath = getListingFilePath(configuration);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Average bytes per map: " + nBytesPerSplit + ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
  }
  SequenceFile.Reader reader = null;
  try {
    reader = getListingFileReader(configuration);
    while (reader.next(srcRelPath, srcFileStatus)) {
      // If adding the current file would push the split past the per-split
      // byte limit, close the current split and add the file to a new one.
      if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
        FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
        }
        splits.add(split);
        lastSplitStart = lastPosition;
        currentSplitSize = 0;
      }
      currentSplitSize += srcFileStatus.getLen();
      lastPosition = reader.getPosition();
    }
    if (lastPosition > lastSplitStart) {
      FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
      }
      splits.add(split);
    }
  } finally {
    IOUtils.closeStream(reader);
  }
  return splits;
}
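Note that the per-split byte budget is computed from the lengths of the source files recorded in the listing, while the FileSplit offsets refer to byte positions inside the listing SequenceFile itself. A quick sketch of the budget arithmetic with illustrative numbers:
  // Illustrative numbers only: 10 GB of source data to be copied by 4 maps.
  long totalSizeBytes = 10L * 1024 * 1024 * 1024;
  int numSplits = 4;
  long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);  // 2684354560 bytes (2.5 GB)
  // A listing-file split is closed as soon as adding the next source file
  // would push the accumulated source bytes past nBytesPerSplit.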
Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.
The class DynamicInputChunk, method openForRead.
private void openForRead(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  reader = new SequenceFileRecordReader<K, V>();
  reader.initialize(new FileSplit(chunkFilePath, 0, DistCpUtils.getFileSize(chunkFilePath, chunkContext.getConfiguration()), null), taskAttemptContext);
}
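Once initialized, the chunk is consumed through the standard RecordReader contract. A minimal sketch, assuming a surrounding method that declares IOException and InterruptedException; in DistCp itself the chunk is driven by the dynamic input format's record reader rather than read directly like this:
  // Illustrative consumption loop over the chunk's key/value pairs.
  while (reader.nextKeyValue()) {
    K key = reader.getCurrentKey();      // in DistCp: the relative source path (Text)
    V value = reader.getCurrentValue();  // in DistCp: the CopyListingFileStatus
    // ... hand the pair to the copy logic ...
  }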