use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
the class CustomPartitionVertex method getFileSplitFromEvent.
private FileSplit getFileSplitFromEvent(InputDataInformationEvent event) throws IOException {
  InputSplit inputSplit = null;
  if (event.getDeserializedUserPayload() != null) {
    inputSplit = (InputSplit) event.getDeserializedUserPayload();
  } else {
    MRSplitProto splitProto = MRSplitProto.parseFrom(ByteString.copyFrom(event.getUserPayload()));
    SerializationFactory serializationFactory = new SerializationFactory(new Configuration());
    inputSplit = MRInputHelpers.createOldFormatSplitFromUserPayload(splitProto, serializationFactory);
  }
  if (!(inputSplit instanceof FileSplit)) {
    throw new UnsupportedOperationException("Cannot handle splits other than FileSplit for the moment. Current input split type: "
        + inputSplit.getClass().getSimpleName());
  }
  return (FileSplit) inputSplit;
}
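The returned org.apache.hadoop.mapred.FileSplit carries exactly the pieces of information the rest of CustomPartitionVertex relies on: the file path (whose name encodes the bucket) and the byte range of the split. A minimal sketch, using only the public FileSplit API; the class name FileSplitInspection and the describe helper are illustrative, not part of the Hive source:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class FileSplitInspection {
  // Hypothetical caller: summarizes the fields the routing logic reads from a split.
  static String describe(FileSplit split) {
    Path path = split.getPath();      // the bucket file, e.g. .../000001_0
    long start = split.getStart();    // byte offset of this split within the file
    long length = split.getLength();  // number of bytes covered by this split
    return path.getName() + " [" + start + ", " + (start + length) + ")";
  }

  public static void main(String[] args) {
    // A FileSplit can be constructed directly for testing; the host list may be empty.
    FileSplit split = new FileSplit(new Path("/warehouse/t/000001_0"), 0L, 134217728L, new String[0]);
    System.out.println(describe(split));
  }
}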
use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
the class CustomPartitionVertex method onRootVertexInitialized.
// One call per root Input
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
  numInputsSeenSoFar++;
  LOG.info("On root vertex initialized " + inputName);
  try {
    // This is using the payload from the RootVertexInitializer corresponding to
    // inputName. Ideally it should be using its own configuration class - but
    // that means serializing another instance.
    MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
    this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
    /*
     * Currently in tez, the flow of events is thus:
     * "Generate Splits -> Initialize Vertex" (with parallelism info obtained
     * from the generate splits phase). The generate splits phase groups
     * splits using the TezGroupedSplitsInputFormat. However, for bucket map
     * joins the grouping done by this input format results in incorrect
     * results as the grouper has no knowledge of buckets. So, we initially
     * set the input format to be HiveInputFormat (in DagUtils) for the case
     * of bucket map joins so as to obtain un-grouped splits. We then group
     * the splits corresponding to buckets using the tez grouper which returns
     * TezGroupedSplits.
     */
    // This assumes that grouping will always be used.
    // Enable grouping on the payload.
    MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
    inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
  } catch (IOException e) {
    e.printStackTrace();
    throw new RuntimeException(e);
  }
  boolean dataInformationEventSeen = false;
  Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
  for (Event event : events) {
    if (event instanceof InputConfigureVertexTasksEvent) {
      // No tasks should have been started yet. Checked by the initial state check.
      LOG.info("Got a input configure vertex event for input: " + inputName);
      Preconditions.checkState(dataInformationEventSeen == false);
      InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
      // The vertex cannot be configured until all DataEvents are seen - to
      // build the routing table.
      configureVertexTaskEvent = cEvent;
      LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
    }
    if (event instanceof InputUpdatePayloadEvent) {
      // This event can never occur. If it does, fail.
      Preconditions.checkState(false);
    } else if (event instanceof InputDataInformationEvent) {
      dataInformationEventSeen = true;
      InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
      FileSplit fileSplit;
      try {
        fileSplit = getFileSplitFromEvent(diEvent);
      } catch (IOException e) {
        throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
      }
      Set<FileSplit> fsList = pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
      if (fsList == null) {
        fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
        pathFileSplitsMap.put(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
      }
      fsList.add(fileSplit);
    }
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Path file splits map for input name: " + inputName + " is " + pathFileSplitsMap);
  }
  Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(inputName, pathFileSplitsMap);
  try {
    int totalResource = context.getTotalAvailableResource().getMemory();
    int taskResource = context.getVertexTaskResource().getMemory();
    float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
    int availableSlots = totalResource / taskResource;
    if (LOG.isDebugEnabled()) {
      LOG.debug("Grouping splits. " + availableSlots + " available slots, " + waves + " waves. Bucket initial splits map: " + bucketToInitialSplitMap);
    }
    JobConf jobConf = new JobConf(conf);
    ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
    Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
    boolean secondLevelGroupingDone = false;
    if ((mainWorkName.isEmpty()) || (inputName.compareTo(mainWorkName) == 0)) {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
        if (mainWorkName.isEmpty() == false) {
          Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
          singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
          groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots, HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
          secondLevelGroupingDone = true;
        }
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
    } else {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      // Group all the bucket files.
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      /*
       * This is the small table side. In case of SMB join, we need to send each split to the
       * corresponding bucket-based task on the other side. In case a split needs to go to
       * multiple downstream tasks, we need to clone the event and send it to the right
       * destination.
       */
      LOG.info("This is the side work - multi-mr work.");
      processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
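The essential point of the loop above is that grouping happens per bucket: splits belonging to different buckets are never merged into the same grouped split, which is what keeps bucket map joins correct. A minimal sketch of that shape, with a placeholder groupWithinBucket helper standing in for Hive's SplitGrouper (the real grouper also considers split sizes, waves and available slots); all names and values here are illustrative:

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

public class PerBucketGroupingSketch {
  // Stand-in for the real grouper: naively pairs up adjacent splits within one bucket.
  static <T> List<List<T>> groupWithinBucket(Collection<T> splits) {
    List<List<T>> groups = new ArrayList<>();
    List<T> current = new ArrayList<>();
    for (T split : splits) {
      current.add(split);
      if (current.size() == 2) {
        groups.add(current);
        current = new ArrayList<>();
      }
    }
    if (!current.isEmpty()) {
      groups.add(current);
    }
    return groups;
  }

  public static void main(String[] args) {
    // bucket id -> raw (un-grouped) splits, mirroring bucketToInitialSplitMap above
    Multimap<Integer, String> bucketToInitialSplits = ArrayListMultimap.create();
    bucketToInitialSplits.putAll(0, List.of("000000_0#s0", "000000_0#s1", "000000_0#s2"));
    bucketToInitialSplits.putAll(1, List.of("000001_0#s0", "000001_0#s1"));

    // bucket id -> grouped splits; grouping never crosses a bucket boundary
    Multimap<Integer, List<String>> bucketToGroupedSplits = HashMultimap.create();
    for (Integer bucket : bucketToInitialSplits.keySet()) {
      for (List<String> group : groupWithinBucket(bucketToInitialSplits.get(bucket))) {
        bucketToGroupedSplits.put(bucket, group);
      }
    }
    System.out.println(bucketToGroupedSplits);
  }
}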
use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
the class CustomPartitionVertex method getBucketSplitMapForPath.
/*
* This method generates the map of bucket to file splits.
*/
private Multimap<Integer, InputSplit> getBucketSplitMapForPath(String inputName, Map<String, Set<FileSplit>> pathFileSplitsMap) {
  Multimap<Integer, InputSplit> bucketToInitialSplitMap = ArrayListMultimap.create();
  boolean fallback = false;
  Map<Integer, Integer> bucketIds = new HashMap<>();
  for (Map.Entry<String, Set<FileSplit>> entry : pathFileSplitsMap.entrySet()) {
    // Extract the bucketId from pathFileSplitsMap. This is the more accurate method;
    // however, it may not work in certain cases where buckets are named after the
    // files used while loading data. In such cases, fall back to the older,
    // potentially inaccurate method.
    // The accepted file names look like 000000_0 or 000001_0_copy_1.
    String bucketIdStr = Utilities.getBucketFileNameFromPathSubString(entry.getKey());
    int bucketId = Utilities.getBucketIdFromFile(bucketIdStr);
    if (bucketId == -1) {
      fallback = true;
      LOG.info("Fallback to using older sort based logic to assign " + "buckets to splits.");
      bucketIds.clear();
      break;
    }
    // Make sure the bucketId stays within the range [0, numBuckets).
    bucketId = bucketId % numBuckets;
    bucketIds.put(bucketId, bucketId);
    for (FileSplit fsplit : entry.getValue()) {
      bucketToInitialSplitMap.put(bucketId, fsplit);
    }
  }
  int bucketNum = 0;
  if (fallback) {
    // Files are taken in alphanumeric order and mapped to the appropriate bucket number.
    for (Map.Entry<String, Set<FileSplit>> entry : pathFileSplitsMap.entrySet()) {
      int bucketId = bucketNum % numBuckets;
      for (FileSplit fsplit : entry.getValue()) {
        bucketToInitialSplitMap.put(bucketId, fsplit);
      }
      bucketNum++;
    }
  }
  // well.
  if (numInputsAffectingRootInputSpecUpdate != 1) {
    // small table
    if (fallback && bucketNum < numBuckets) {
      // Old logic.
      int loopedBucketId = 0;
      for (; bucketNum < numBuckets; bucketNum++) {
        for (InputSplit fsplit : bucketToInitialSplitMap.get(loopedBucketId)) {
          bucketToInitialSplitMap.put(bucketNum, fsplit);
        }
        loopedBucketId++;
      }
    } else {
      // New logic.
      if (inputToBucketMap.containsKey(inputName)) {
        int inputNumBuckets = inputToBucketMap.get(inputName);
        if (inputNumBuckets < numBuckets) {
          // Need to send the splits to multiple buckets.
          for (int i = 1; i < numBuckets / inputNumBuckets; i++) {
            int bucketIdBase = i * inputNumBuckets;
            for (Integer bucketId : bucketIds.keySet()) {
              for (InputSplit fsplit : bucketToInitialSplitMap.get(bucketId)) {
                bucketToInitialSplitMap.put(bucketIdBase + bucketId, fsplit);
              }
            }
          }
        }
      }
    }
  }
  return bucketToInitialSplitMap;
}
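To make the replication step at the end concrete: if the vertex expects numBuckets = 4 but this input was written with only inputNumBuckets = 2 buckets, the loop copies bucket 0's splits to bucket 2 and bucket 1's splits to bucket 3 (bucketIdBase = i * inputNumBuckets for i = 1). A minimal standalone sketch of that arithmetic with made-up values; the class name and split strings are illustrative only:

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import java.util.ArrayList;

public class BucketReplicationSketch {
  public static void main(String[] args) {
    int numBuckets = 4;       // buckets expected by the big-table side
    int inputNumBuckets = 2;  // buckets actually present in this (small) input

    // bucket id -> splits, as derived from the bucket file names (000000_0, 000001_0, ...)
    Multimap<Integer, String> bucketToSplits = ArrayListMultimap.create();
    bucketToSplits.put(0, "split(000000_0)");
    bucketToSplits.put(1, "split(000001_0)");

    // Replicate each input bucket's splits to the higher bucket ids it also serves.
    for (int i = 1; i < numBuckets / inputNumBuckets; i++) {
      int bucketIdBase = i * inputNumBuckets;
      for (int bucketId = 0; bucketId < inputNumBuckets; bucketId++) {
        // Snapshot the values before inserting under a new key.
        for (String split : new ArrayList<>(bucketToSplits.get(bucketId))) {
          bucketToSplits.put(bucketIdBase + bucketId, split);
        }
      }
    }
    // Buckets 2 and 3 now hold copies of the splits from buckets 0 and 1 respectively.
    System.out.println(bucketToSplits);
  }
}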
use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
the class HiveSplitGenerator method initialize.
@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
  // Setup the map work for this thread. Pruning modified the work instance to potentially remove
  // partitions. The same work instance must be used when generating splits.
  Utilities.setMapWork(jobConf, work);
  try {
    boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
    // perform dynamic partition pruning
    if (pruner != null) {
      pruner.prune();
    }
    InputSplitInfoMem inputSplitInfo = null;
    boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
    LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
    String realInputFormatName = conf.get("mapred.input.format.class");
    boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
    if (groupingEnabled) {
      // Need to instantiate the realInputFormat
      InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
      int totalResource = 0;
      int taskResource = 0;
      int availableSlots = 0;
      // FIXME. Do the right thing Luke.
      if (getContext() == null) {
        // for now, totalResource = taskResource for llap
        availableSlots = 1;
      }
      if (getContext() != null) {
        totalResource = getContext().getTotalAvailableResource().getMemory();
        taskResource = getContext().getVertexTaskResource().getMemory();
        availableSlots = totalResource / taskResource;
      }
      if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
        // broken configuration from mapred-default.xml
        final long blockSize = conf.getLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
        final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
        final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
        HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
        LOG.info("The preferred split size is " + preferredSplitSize);
      }
      // Create the un-grouped splits
      float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
      // Raw splits
      InputSplit[] splits = inputFormat.getSplits(jobConf, (int) (availableSlots * waves));
      // Sort the splits, so that subsequent grouping is consistent.
      Arrays.sort(splits, new InputSplitComparator());
      LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
      // increment/set input counters
      InputInitializerContext inputInitializerContext = getContext();
      TezCounters tezCounters = null;
      String counterName;
      String groupName = null;
      String vertexName = null;
      if (inputInitializerContext != null) {
        tezCounters = new TezCounters();
        groupName = HiveInputCounters.class.getName();
        vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
        counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
        tezCounters.findCounter(groupName, counterName).increment(splits.length);
        final List<Path> paths = Utilities.getInputPathsTez(jobConf, work);
        counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_DIRECTORIES.name(), vertexName);
        tezCounters.findCounter(groupName, counterName).increment(paths.size());
        final Set<String> files = new HashSet<>();
        for (InputSplit inputSplit : splits) {
          if (inputSplit instanceof FileSplit) {
            final FileSplit fileSplit = (FileSplit) inputSplit;
            final Path path = fileSplit.getPath();
            // The assumption here is the path is a file. Only case this is different is ACID deltas.
            // The isFile check is avoided here for performance reasons.
            final String fileStr = path.toString();
            if (!files.contains(fileStr)) {
              files.add(fileStr);
            }
          }
        }
        counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_FILES.name(), vertexName);
        tezCounters.findCounter(groupName, counterName).increment(files.size());
      }
      if (work.getIncludedBuckets() != null) {
        splits = pruneBuckets(work, splits);
      }
      Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
      // And finally return them in a flat array
      InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
      LOG.info("Number of split groups: " + flatSplits.length);
      if (inputInitializerContext != null) {
        counterName = Utilities.getVertexCounterName(HiveInputCounters.GROUPED_INPUT_SPLITS.name(), vertexName);
        tezCounters.findCounter(groupName, counterName).setValue(flatSplits.length);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Published tez counters: " + tezCounters);
        }
        inputInitializerContext.addCounters(tezCounters);
      }
      List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
      inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
    } else {
      // If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
      throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
      // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
    }
    return createEventList(sendSerializedEvents, inputSplitInfo);
  } finally {
    Utilities.clearWork(jobConf);
  }
}
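The sizing logic in this method comes down to a little arithmetic: the slot count is total vertex memory divided by per-task memory, the raw split count requested from the input format is availableSlots * waves, and when the configured minimum split size (HiveConf.ConfVars.MAPREDMINSPLITSIZE) is <= 1, a saner value of min(blockSize / 2, tez.grouping.min-size) is substituted. A minimal sketch with made-up numbers; in HiveSplitGenerator these come from the InputInitializerContext and the job configuration:

public class SplitSizingSketch {
  public static void main(String[] args) {
    // Illustrative cluster figures only.
    int totalResourceMb = 102400;  // total memory available to the vertex
    int taskResourceMb = 4096;     // memory per task
    float waves = 1.7f;            // tez.grouping.split-waves

    int availableSlots = totalResourceMb / taskResourceMb;  // 25
    int requestedSplits = (int) (availableSlots * waves);   // 42, passed to getSplits()

    // When the configured minimum split size is <= 1, a preferred value is derived.
    long blockSize = 128L * 1024 * 1024;        // dfs.blocksize
    long minGrouping = 16L * 1024 * 1024;       // tez.grouping.min-size
    long preferredSplitSize = Math.min(blockSize / 2, minGrouping);  // 16 MB

    System.out.println(availableSlots + " slots, " + requestedSplits
        + " requested splits, preferred min split size = " + preferredSplitSize);
  }
}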
use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
the class HostAffinitySplitLocationProvider method getLocations.
@Override
public String[] getLocations(InputSplit split) throws IOException {
  if (!(split instanceof FileSplit)) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Split: " + split + " is not a FileSplit. Using default locations");
    }
    return split.getLocations();
  }
  FileSplit fsplit = (FileSplit) split;
  String splitDesc = "Split at " + fsplit.getPath() + " with offset= " + fsplit.getStart() + ", length=" + fsplit.getLength();
  String location = locations.get(determineLocation(locations, fsplit.getPath().toString(), fsplit.getStart(), splitDesc));
  return (location != null) ? new String[] { location } : null;
}
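The intent here is host affinity: the same (path, offset) pair should always map to the same entry in the fixed location list, so a given split is repeatedly scheduled on the same host across queries and can hit that host's cache. A simplified, self-contained sketch of that idea; the pickLocation helper below just hashes to an index and is only a stand-in for Hive's determineLocation, which is more elaborate:

import java.util.List;
import java.util.Objects;

public class AffinitySketch {
  // Derive a stable index into the known location list from the split's path and
  // starting offset, so the same split is routed to the same host every time.
  static String pickLocation(List<String> locations, String path, long start) {
    int index = Math.floorMod(Objects.hash(path, start), locations.size());
    return locations.get(index);
  }

  public static void main(String[] args) {
    List<String> llapHosts = List.of("node1", "node2", "node3");
    // The same (path, offset) always yields the same host.
    System.out.println(pickLocation(llapHosts, "/warehouse/t/000001_0", 0L));
    System.out.println(pickLocation(llapHosts, "/warehouse/t/000001_0", 0L));
    // A different offset within the same file may map to a different host.
    System.out.println(pickLocation(llapHosts, "/warehouse/t/000001_0", 134217728L));
  }
}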