
Example 31 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.

Class AbstractWholeFileNodeTupleReader, method initialize.

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;
    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown.  Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    // Figure out what portion of the file to read
    if (split.getStart() > 0)
        throw new IOException("This record reader requires a file split which covers the entire file");
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", new Object[] { split.getStart(), split.getLength(), totalLength }));
    if (totalLength > split.getLength())
        throw new IOException("This record reader requires a file split which covers the entire file");
    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (this.compressionCodecs != null) {
        // Compressed input
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input
        input = new TrackedInputStream(fileIn);
    }
    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) FileSystem(org.apache.hadoop.fs.FileSystem) RDFParserBuilder(org.apache.jena.riot.RDFParserBuilder) TrackedInputStream(org.apache.jena.hadoop.rdf.io.input.util.TrackedInputStream) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)
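
The reader in example 31 rejects any split that does not start at byte 0 and cover the whole file, so the InputFormat that feeds it must disable splitting. A minimal sketch of that pairing, assuming a hypothetical WholeFileInputFormat built on TextInputFormat rather than Jena's actual input format classes:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Hypothetical input format, not Jena's real class: returning false from
// isSplitable() makes FileInputFormat emit exactly one FileSplit per file
// (start 0, length = file length), which is the precondition the record
// reader above enforces before it starts parsing.
public class WholeFileInputFormat extends TextInputFormat {
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}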

Example 32 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project asterixdb by apache.

Class HDFSReadOperatorDescriptor, method createPushRuntime.

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx, IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions) throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();
    return new AbstractUnaryOutputSourceOperatorNodePushable() {

        private String nodeName = ctx.getJobletContext().getServiceContext().getNodeId();

        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions on the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer, inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
Also used : AbstractUnaryOutputSourceOperatorNodePushable(org.apache.hyracks.dataflow.std.base.AbstractUnaryOutputSourceOperatorNodePushable) RecordReader(org.apache.hadoop.mapreduce.RecordReader) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) HyracksDataException(org.apache.hyracks.api.exceptions.HyracksDataException) ContextFactory(org.apache.hyracks.hdfs.ContextFactory) IKeyValueParser(org.apache.hyracks.hdfs.api.IKeyValueParser) InputFormat(org.apache.hadoop.mapreduce.InputFormat) Job(org.apache.hadoop.mapreduce.Job)
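
The operator in example 32 is Hyracks-specific, but its inner loop is the standard Hadoop mapreduce record-reading sequence. A self-contained sketch of that sequence using only Hadoop classes (the path, split bounds, and the choice of TextInputFormat are placeholders, not taken from the example):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class SplitReaderSketch {
    // Reads a single FileSplit with the create/initialize/nextKeyValue sequence
    // that the Hyracks operator drives; path and length are placeholder values.
    public static void readSplit(Configuration conf) throws IOException, InterruptedException {
        FileSplit split = new FileSplit(new Path("/tmp/example.txt"), 0, 1024, null);
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        TextInputFormat inputFormat = new TextInputFormat();
        RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
        reader.initialize(split, context);
        try {
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
        } finally {
            reader.close();
        }
    }
}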

Example 33 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project asterixdb by apache.

Class SchedulerTest, method testSchedulerSmallerHDFS.

/**
 * Test the case where the HDFS cluster is smaller than the Hyracks cluster.
 *
 * @throws Exception
 */
public void testSchedulerSmallerHDFS() throws Exception {
    Map<String, NodeControllerInfo> ncNameToNcInfos = TestUtils.generateNodeControllerInfo(6, "nc", "10.0.0.", 5099, 5098, 5097);
    List<InputSplit> fileSplits = new ArrayList<>();
    fileSplits.add(new FileSplit(new Path("part-1"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-2"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-3"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-4"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-5"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-6"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-7"), 0, 0, new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" }));
    fileSplits.add(new FileSplit(new Path("part-8"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-9"), 0, 0, new String[] { "10.0.0.4", "10.0.0.5", "10.0.0.1" }));
    fileSplits.add(new FileSplit(new Path("part-10"), 0, 0, new String[] { "10.0.0.2", "10.0.0.1", "10.0.0.2" }));
    fileSplits.add(new FileSplit(new Path("part-11"), 0, 0, new String[] { "10.0.0.3", "10.0.0.4", "10.0.0.5" }));
    fileSplits.add(new FileSplit(new Path("part-12"), 0, 0, new String[] { "10.0.0.2", "10.0.0.3", "10.0.0.5" }));
    Scheduler scheduler = new Scheduler(ncNameToNcInfos);
    String[] locationConstraints = scheduler.getLocationConstraints(fileSplits);
    String[] expectedResults = new String[] { "nc1", "nc4", "nc4", "nc1", "nc3", "nc2", "nc2", "nc3", "nc5", "nc6", "nc5", "nc6" };
    for (int i = 0; i < locationConstraints.length; i++) {
        Assert.assertEquals(locationConstraints[i], expectedResults[i]);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) NodeControllerInfo(org.apache.hyracks.api.client.NodeControllerInfo) ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
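
The scheduler in example 33 bases its placement decisions on the host hints carried by each FileSplit. A minimal sketch of constructing a split with location hints and reading them back (the path and host addresses are placeholders):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitLocationSketch {
    public static void main(String[] args) throws IOException {
        // The fourth argument lists the hosts holding replicas of this byte
        // range; schedulers use it to prefer data-local assignments.
        FileSplit split = new FileSplit(new Path("part-1"), 0, 0,
                new String[] { "10.0.0.1", "10.0.0.2", "10.0.0.3" });
        for (String host : split.getLocations()) {
            System.out.println("replica host: " + host);
        }
    }
}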

Example 34 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project carbondata by apache.

Class CarbonTableReader, method getSegmentAbstractIndexs.

private Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> getSegmentAbstractIndexs(/*JobContext job,*/
        AbsoluteTableIdentifier absoluteTableIdentifier, CarbonTablePath tablePath, String segmentId,
        CacheClient cacheClient, SegmentUpdateStatusManager updateStatusManager) throws IOException {
    Map<SegmentTaskIndexStore.TaskBucketHolder, AbstractIndex> segmentIndexMap = null;
    SegmentTaskIndexWrapper segmentTaskIndexWrapper = null;
    boolean isSegmentUpdated = false;
    Set<SegmentTaskIndexStore.TaskBucketHolder> taskKeys = null;
    TableSegmentUniqueIdentifier tableSegmentUniqueIdentifier = new TableSegmentUniqueIdentifier(absoluteTableIdentifier, segmentId);
    segmentTaskIndexWrapper = cacheClient.getSegmentAccessClient().getIfPresent(tableSegmentUniqueIdentifier);
    UpdateVO updateDetails = updateStatusManager.getInvalidTimestampRange(segmentId);
    if (null != segmentTaskIndexWrapper) {
        segmentIndexMap = segmentTaskIndexWrapper.getTaskIdToTableSegmentMap();
        if (isSegmentUpdate(segmentTaskIndexWrapper, updateDetails)) {
            taskKeys = segmentIndexMap.keySet();
            isSegmentUpdated = true;
        }
    }
    // if segment tree is not loaded, load the segment tree
    if (segmentIndexMap == null || isSegmentUpdated) {
        List<FileStatus> fileStatusList = new LinkedList<FileStatus>();
        List<String> segs = new ArrayList<>();
        segs.add(segmentId);
        FileSystem fs = getFileStatusOfSegments(new String[] { segmentId }, tablePath, fileStatusList);
        List<InputSplit> splits = getSplit(fileStatusList, fs);
        List<FileSplit> carbonSplits = new ArrayList<>();
        for (InputSplit inputSplit : splits) {
            FileSplit fileSplit = (FileSplit) inputSplit;
            // how should the separator be added here??
            String segId = CarbonTablePath.DataPathUtil.getSegmentId(fileSplit.getPath().toString());
            if (segId.equals(CarbonCommonConstants.INVALID_SEGMENT_ID)) {
                continue;
            }
            carbonSplits.add(fileSplit);
        }
        List<TableBlockInfo> tableBlockInfoList = new ArrayList<>();
        for (FileSplit inputSplit : carbonSplits) {
            if (isValidBlockBasedOnUpdateDetails(taskKeys, inputSplit, updateDetails, updateStatusManager, segmentId)) {
                // at this level we do not need blocklet info!!!! Is this a trick?
                BlockletInfos blockletInfos = new BlockletInfos(0, 0, 0);
                tableBlockInfoList.add(new TableBlockInfo(inputSplit.getPath().toString(), inputSplit.getStart(), segmentId, inputSplit.getLocations(), inputSplit.getLength(), blockletInfos, ColumnarFormatVersion.valueOf(CarbonCommonConstants.CARBON_DATA_FILE_DEFAULT_VERSION), null));
            }
        }
        Map<String, List<TableBlockInfo>> segmentToTableBlocksInfos = new HashMap<>();
        segmentToTableBlocksInfos.put(segmentId, tableBlockInfoList);
        // get Btree blocks for given segment
        tableSegmentUniqueIdentifier.setSegmentToTableBlocksInfos(segmentToTableBlocksInfos);
        tableSegmentUniqueIdentifier.setIsSegmentUpdated(isSegmentUpdated);
        segmentTaskIndexWrapper = cacheClient.getSegmentAccessClient().get(tableSegmentUniqueIdentifier);
        segmentIndexMap = segmentTaskIndexWrapper.getTaskIdToTableSegmentMap();
    }
    return segmentIndexMap;
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) ImmutableList(com.google.common.collect.ImmutableList) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
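
Setting the CarbonData index plumbing aside, the loop in example 34 only pulls four pieces of block metadata out of each FileSplit. A minimal sketch of that extraction using Hadoop classes alone (the printout stands in for the TableBlockInfo construction, which is CarbonData-specific API):

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class BlockMetadataSketch {
    // Narrows generic InputSplits to FileSplits and reads the block-level
    // metadata that example 34 packs into TableBlockInfo objects.
    public static void describe(List<InputSplit> splits) throws IOException {
        for (InputSplit inputSplit : splits) {
            if (!(inputSplit instanceof FileSplit)) {
                continue;
            }
            FileSplit fileSplit = (FileSplit) inputSplit;
            System.out.printf("path=%s start=%d length=%d hosts=%s%n",
                    fileSplit.getPath(), fileSplit.getStart(), fileSplit.getLength(),
                    Arrays.toString(fileSplit.getLocations()));
        }
    }
}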

Example 35 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.

Class UniformSizeInputFormat, method getSplits.

private List<InputSplit> getSplits(Configuration configuration, int numSplits, long totalSizeBytes) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
    CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
    Text srcRelPath = new Text();
    long currentSplitSize = 0;
    long lastSplitStart = 0;
    long lastPosition = 0;
    final Path listingFilePath = getListingFilePath(configuration);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Average bytes per map: " + nBytesPerSplit + ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
    }
    SequenceFile.Reader reader = null;
    try {
        reader = getListingFileReader(configuration);
        while (reader.next(srcRelPath, srcFileStatus)) {
            // If adding the current file would push the split past the per-split
            // byte limit, close the current split and start a new one with this file
            if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
                FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
                }
                splits.add(split);
                lastSplitStart = lastPosition;
                currentSplitSize = 0;
            }
            currentSplitSize += srcFileStatus.getLen();
            lastPosition = reader.getPosition();
        }
        if (lastPosition > lastSplitStart) {
            FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
            }
            splits.add(split);
        }
    } finally {
        IOUtils.closeStream(reader);
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
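
The split boundary logic in example 35 depends only on the running byte count, so it can be illustrated without the SequenceFile listing. A small sketch of the same uniform-size grouping over plain file lengths, using the ceil-based target from the example (for instance, totalSizeBytes = 100 and numSplits = 3 give nBytesPerSplit = ceil(100 / 3.0) = 34):

import java.util.ArrayList;
import java.util.List;

public class UniformSizeSketch {
    // Groups file lengths into "splits" so each group stays near the per-split
    // byte target, mirroring the boundary test in getSplits above.
    public static List<List<Long>> group(List<Long> fileLengths, int numSplits) {
        long totalSizeBytes = fileLengths.stream().mapToLong(Long::longValue).sum();
        long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
        List<List<Long>> groups = new ArrayList<>();
        List<Long> current = new ArrayList<>();
        long currentSplitSize = 0;
        for (long len : fileLengths) {
            // Same boundary condition as the example: close the current group
            // before a file that would push it past the target size.
            if (currentSplitSize + len > nBytesPerSplit && !current.isEmpty()) {
                groups.add(current);
                current = new ArrayList<>();
                currentSplitSize = 0;
            }
            current.add(len);
            currentSplitSize += len;
        }
        if (!current.isEmpty()) {
            groups.add(current);
        }
        return groups;
    }
}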

Aggregations

FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 39 usages
Path (org.apache.hadoop.fs.Path): 22 usages
Configuration (org.apache.hadoop.conf.Configuration): 13 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 12 usages
IOException (java.io.IOException): 10 usages
ArrayList (java.util.ArrayList): 10 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 7 usages
BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit): 4 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 usages
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 4 usages
Text (org.apache.hadoop.io.Text): 3 usages
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 3 usages
BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter): 2 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 2 usages
File (java.io.File): 2 usages
Constructor (java.lang.reflect.Constructor): 2 usages
Schema (org.apache.avro.Schema): 2 usages
AvroKeyRecordReader (org.apache.avro.mapreduce.AvroKeyRecordReader): 2 usages
FileSplitPartitionQuery (org.apache.gora.query.impl.FileSplitPartitionQuery): 2 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 2 usages