use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.
the class AbstractWholeFileNodeTupleReader method initialize.
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    if (split.getStart() > 0)
        throw new IOException("This record reader requires a file split which covers the entire file");
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { split.getStart(), split.getLength(), totalLength }));
    if (totalLength > split.getLength())
        throw new IOException("This record reader requires a file split which covers the entire file");

    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (this.compressionCodecs != null) {
        // Compressed input
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input
        input = new TrackedInputStream(fileIn);
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
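The reader above rejects any split that does not span the whole file, because whole-file RDF syntaxes cannot be parsed from an arbitrary byte offset. A minimal sketch of that invariant pulled out into a standalone helper, using only standard Hadoop APIs (the class and method names are illustrative, not part of Jena):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public final class WholeFileSplitCheck {

    // Hypothetical helper: fail fast unless the split covers the file from byte 0 to its full length.
    public static void requireWholeFile(FileSplit split, Configuration conf) throws IOException {
        if (split.getStart() != 0)
            throw new IOException("Split must start at the beginning of the file");
        Path file = split.getPath();
        long fileLength = file.getFileSystem(conf).getFileStatus(file).getLen();
        if (split.getLength() < fileLength)
            throw new IOException("Split covers only " + split.getLength() + " of " + fileLength + " bytes");
    }
}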
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project asterixdb by apache.
the class HDFSReadOperatorDescriptor method createPushRuntime.
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();
    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getServiceContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions on the same machine
                         */
                        synchronized (executed) {
                            if (!executed[i]) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue()) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
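Outside of Hyracks, the same createRecordReader / initialize / nextKeyValue protocol can be driven by hand against a single FileSplit. A minimal, standalone sketch assuming a local text file (the class name and local-run setup are illustrative, not part of AsterixDB):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class SingleSplitReadExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]);
        long len = FileSystem.get(conf).getFileStatus(file).getLen();

        // One split covering the whole file; no preferred hosts for a local run.
        FileSplit split = new FileSplit(file, 0, len, null);
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        TextInputFormat inputFormat = new TextInputFormat();
        RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
        reader.initialize(split, context);
        try {
            // Same pull-based loop the operator above uses: advance, then read key and value.
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
        } finally {
            reader.close();
        }
    }
}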
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project asterixdb by apache.
the class SchedulerTest method testSchedulerSmallerHDFS.
/**
 * Test the case where the HDFS cluster is smaller than the Hyracks cluster
 *
 * @throws Exception
 */
public void testSchedulerSmallerHDFS() throws Exception {
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099,
            5098, 5097);
    List<InputSplit> fileSplits = new ArrayList<>();
    fileSplits.add(new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-3"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-4"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-5"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-6"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-7"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-8"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-9"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.1" }));
    fileSplits.add(new FileSplit(new Path("part-10"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.2" }));
    fileSplits.add(new FileSplit(new Path("part-11"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-12"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" }));
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc5", "nc6",
            "nc5", "nc6" };
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
}
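The scheduler only consumes the host hints carried by each FileSplit, which it reads back through getLocations(). A minimal standalone sketch of inspecting those hints, using illustrative paths and hosts rather than the test fixtures above:

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitLocalityExample {
    public static void main(String[] args) throws Exception {
        List<InputSplit> splits = Arrays.asList(
                new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2" }),
                new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3" }));
        for (InputSplit split : splits) {
            // getLocations() returns the datanode hosts a scheduler can use for locality-aware placement.
            System.out.println(((FileSplit) split).getPath() + " -> " + Arrays.toString(split.getLocations()));
        }
    }
}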
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project carbondata by apache.
the class CarbonTableReader method getSegmentAbstractIndexs.
private Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> getSegmentAbstractIndexs(/*JobContext job,*/
        AbsoluteTableIdentifier absoluteTableIdentifier, CarbonTablePath tablePath, String segmentId,
        CacheClient cacheClient, SegmentUpdateStatusManager updateStatusManager) throws IOException {
    Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> segmentIndexMap = null;
    SegmentTaskIndexWrapper segmentTaskIndexWrapper = null;
    boolean isSegmentUpdated = false;
    Set<SegmentTaskIndexStore.TaskBucketHolder> taskKeys = null;
    TableSegmentUniqueIdentifier tableSegmentUniqueIdentifier =
            new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segmentId);
    segmentTaskIndexWrapper = cacheClient.getSegmentAccessClient().getIfPresent(tableSegmentUniqueIdentifier);
    UpdateVO updateDetails = updateStatusManager.getInvalidTimestampRange(segmentId);
    if (null != segmentTaskIndexWrapper) {
        segmentIndexMap = segmentTaskIndexWrapper.getTaskIdToTableSegmentMap();
        if (isSegmentUpdate(segmentTaskIndexWrapper, updateDetails)) {
            taskKeys = segmentIndexMap.keySet();
            isSegmentUpdated = true;
        }
    }
    // if segment tree is not loaded, load the segment tree
    if (segmentIndexMap == null || isSegmentUpdated) {
        List<FileStatus> fileStatusList = new LinkedList<FileStatus>();
        List<String> segs = new ArrayList<>();
        segs.add(segmentId);
        FileSystem fs = getFileStatusOfSegments(new String[] { segmentId }, tablePath, fileStatusList);
        List<InputSplit> splits = getSplit(fileStatusList, fs);
        List<FileSplit> carbonSplits = new ArrayList<>();
        for (InputSplit inputSplit : splits) {
            FileSplit fileSplit = (FileSplit) inputSplit;
            // how should the separator be added here??
            String segId = CarbonTablePath.DataPathUtil.getSegmentId(fileSplit.getPath().toString());
            if (segId.equals(CarbonCommonConstants.INVALID_SEGMENT_ID)) {
                continue;
            }
            carbonSplits.add(fileSplit);
        }
        List<TableBlockInfo> tableBlockInfoList = new ArrayList<>();
        for (FileSplit inputSplit : carbonSplits) {
            if (isValidBlockBasedOnUpdateDetails(taskKeys, inputSplit, updateDetails, updateStatusManager,
                    segmentId)) {
                // at this level we do not need blocklet info
                BlockletInfos blockletInfos = new BlockletInfos(0, 0, 0);
                tableBlockInfoList.add(new TableBlockInfo(inputSplit.getPath().toString(), inputSplit.getStart(),
                        segmentId, inputSplit.getLocations(), inputSplit.getLength(), blockletInfos,
                        ColumnarFormatVersion.valueOf(CarbonCommonConstants.CARBON_DATA_FILE_DEFAULT_VERSION),
                        null));
            }
        }
        Map<String, List<TableBlockInfo>> segmentToTableBlocksInfos = new HashMap<>();
        segmentToTableBlocksInfos.put(segmentId, tableBlockInfoList);
        // get Btree blocks for given segment
        tableSegmentUniqueIdentifier.setSegmentToTableBlocksInfos(segmentToTableBlocksInfos);
        tableSegmentUniqueIdentifier.setIsSegmentUpdated(isSegmentUpdated);
        segmentTaskIndexWrapper = cacheClient.getSegmentAccessClient().get(tableSegmentUniqueIdentifier);
        segmentIndexMap = segmentTaskIndexWrapper.getTaskIdToTableSegmentMap();
    }
    return segmentIndexMap;
}
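The inner loop above only pulls four fields out of each FileSplit: path, start offset, length, and preferred hosts. A minimal sketch of that projection into a plain holder class (the BlockDescriptor type is illustrative, standing in for CarbonData's TableBlockInfo):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public final class BlockDescriptor {
    final String filePath;
    final long offset;
    final long length;
    final String[] hosts;

    BlockDescriptor(String filePath, long offset, long length, String[] hosts) {
        this.filePath = filePath;
        this.offset = offset;
        this.length = length;
        this.hosts = hosts;
    }

    // Project the locality-relevant fields of each split into a plain descriptor.
    static List<BlockDescriptor> fromSplits(List<FileSplit> splits) throws IOException {
        List<BlockDescriptor> blocks = new ArrayList<>();
        for (FileSplit split : splits) {
            blocks.add(new BlockDescriptor(split.getPath().toString(), split.getStart(), split.getLength(),
                    split.getLocations()));
        }
        return blocks;
    }
}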
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.
the class UniformSizeInputFormat method getSplits.
private List<InputSplit> getSplits(Configuration configuration, int numSplits, long totalSizeBytes)
        throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
    CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
    Text srcRelPath = new Text();
    long currentSplitSize = 0;
    long lastSplitStart = 0;
    long lastPosition = 0;
    final Path listingFilePath = getListingFilePath(configuration);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Average bytes per map: " + nBytesPerSplit + ", Number of maps: " + numSplits + ", total size: "
                + totalSizeBytes);
    }
    SequenceFile.Reader reader = null;
    try {
        reader = getListingFileReader(configuration);
        while (reader.next(srcRelPath, srcFileStatus)) {
            // If adding the current file would push the split over the per-split
            // limit, close the current split and add the file to a new split
            if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
                FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
                }
                splits.add(split);
                lastSplitStart = lastPosition;
                currentSplitSize = 0;
            }
            currentSplitSize += srcFileStatus.getLen();
            lastPosition = reader.getPosition();
        }
        if (lastPosition > lastSplitStart) {
            FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
            }
            splits.add(split);
        }
    } finally {
        IOUtils.closeStream(reader);
    }
    return splits;
}
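Note that the splits produced here address byte ranges of the DistCp copy listing (a SequenceFile), not of the data files themselves; each mapper then copies every file recorded in its range. A small sketch of the per-split target-size arithmetic, with illustrative values:

public class SplitSizeMath {
    public static void main(String[] args) {
        long totalSizeBytes = 10L * 1024 * 1024 * 1024; // 10 GiB of source data, illustrative
        int numSplits = 4;
        // Same rounding-up division as UniformSizeInputFormat: each mapper
        // targets roughly totalSizeBytes / numSplits bytes of source data.
        long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
        System.out.println(nBytesPerSplit); // 2684354560, i.e. 2.5 GiB per split
    }
}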