Example 56 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project apex-malhar by apache.

From the class MapOperator, method definePartitions.

@SuppressWarnings("rawtypes")
@Override
public Collection<Partition<MapOperator<K1, V1, K2, V2>>> definePartitions(Collection<Partition<MapOperator<K1, V1, K2, V2>>> partitions, PartitioningContext context) {
    int tempPartitionCount = partitionCount;
    // Raw-typed intermediate assignment works around generic type incompatibility.
    Collection c = partitions;
    Collection<Partition<MapOperator<K1, V1, K2, V2>>> operatorPartitions = c;
    // Use the first existing partition as a template for per-operator settings.
    Partition<MapOperator<K1, V1, K2, V2>> template;
    Iterator<Partition<MapOperator<K1, V1, K2, V2>>> itr = operatorPartitions.iterator();
    template = itr.next();
    Configuration conf = new Configuration();
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    if (outstream.size() == 0) {
        InputSplit[] splits;
        try {
            splits = getSplits(new JobConf(conf), tempPartitionCount, template.getPartitionedInstance().getDirName());
        } catch (Exception e1) {
            logger.info(" can't get splits {}", e1.getMessage());
            throw new RuntimeException(e1);
        }
        Collection<Partition<MapOperator<K1, V1, K2, V2>>> operList = new ArrayList<Partition<MapOperator<K1, V1, K2, V2>>>();
        itr = operatorPartitions.iterator();
        int size = splits.length;
        // Note: a single serializer keyed on splits[0] assumes all splits share one class.
        Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
        // First pass: reuse the existing partitions, assigning one split to each.
        while (size > 0 && itr.hasNext()) {
            Partition<MapOperator<K1, V1, K2, V2>> p = itr.next();
            MapOperator<K1, V1, K2, V2> opr = p.getPartitionedInstance();
            opr.setInputFormatClass(inputFormatClass);
            opr.setMapClass(mapClass);
            opr.setCombineClass(combineClass);
            opr.setConfigFile(configFile);
            try {
                keySerializer.open(opr.getOutstream());
                keySerializer.serialize(splits[size - 1]);
                opr.setInputSplitClass(splits[size - 1].getClass());
            } catch (IOException e) {
                logger.info("error while serializing {}", e.getMessage());
            }
            size--;
            operList.add(p);
        }
        // Second pass: if splits remain, create new operator instances for them.
        while (size > 0) {
            MapOperator<K1, V1, K2, V2> opr = new MapOperator<K1, V1, K2, V2>();
            opr.setInputFormatClass(inputFormatClass);
            opr.setMapClass(mapClass);
            opr.setCombineClass(combineClass);
            opr.setConfigFile(configFile);
            try {
                keySerializer.open(opr.getOutstream());
                keySerializer.serialize(splits[size - 1]);
                opr.setInputSplitClass(splits[size - 1].getClass());
            } catch (IOException e) {
                logger.info("error while serializing {}", e.getMessage());
            }
            size--;
            operList.add(new DefaultPartition<MapOperator<K1, V1, K2, V2>>(opr));
        }
        try {
            keySerializer.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return operList;
    }
    return null;
}
Also used : DefaultPartition(com.datatorrent.api.DefaultPartition) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) SerializationFactory(org.apache.hadoop.io.serializer.SerializationFactory) IOException(java.io.IOException) Collection(java.util.Collection) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf) Serializer(org.apache.hadoop.io.serializer.Serializer)
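
The serialization half shown above has an implied counterpart: each partitioned operator must later reconstruct its InputSplit from the bytes written into its outstream. That read-back path is not part of this example; the following is a minimal sketch of it, assuming the split bytes and the class recorded via setInputSplitClass() are available (SplitReadBack and readSplit are illustrative names, not MapOperator code):

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.mapred.InputSplit;

public class SplitReadBack {

    // Reconstruct an InputSplit written by Serializer.serialize(), mirroring
    // the serialization loop in definePartitions above.
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static InputSplit readSplit(byte[] bytes, Class<? extends InputSplit> splitClass)
            throws IOException {
        Configuration conf = new Configuration();
        SerializationFactory factory = new SerializationFactory(conf);
        // getDeserializer() is the counterpart of the getSerializer() call above.
        Deserializer<InputSplit> deserializer =
                (Deserializer<InputSplit>) factory.getDeserializer((Class) splitClass);
        deserializer.open(new ByteArrayInputStream(bytes));
        try {
            // Passing null asks the deserializer to allocate a fresh instance.
            return deserializer.deserialize(null);
        } finally {
            deserializer.close();
        }
    }
}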

Example 57 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class FetchOperator, method getNextSplits.

protected FetchInputFormatSplit[] getNextSplits() throws Exception {
    while (getNextPath()) {
        // not using FileInputFormat.setInputPaths() here because it forces a connection to the
        // default file system - which may or may not be online during pure metadata operations
        job.set("mapred.input.dir", StringUtils.escapeString(currPath.toString()));
        // Fetch operator is not vectorized and as such turn vectorization flag off so that
        // non-vectorized record reader is created below.
        HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, false);
        Class<? extends InputFormat> formatter = currDesc.getInputFileFormatClass();
        Utilities.copyTableJobPropertiesToConf(currDesc.getTableDesc(), job);
        InputFormat inputFormat = getInputFormatFromCache(formatter, job);
        String inputs = processCurrPathForMmWriteIds(inputFormat);
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("Setting fetch inputs to " + inputs);
        }
        if (inputs == null) {
            return null;
        }
        job.set("mapred.input.dir", inputs);
        InputSplit[] splits = inputFormat.getSplits(job, 1);
        FetchInputFormatSplit[] inputSplits = new FetchInputFormatSplit[splits.length];
        for (int i = 0; i < splits.length; i++) {
            inputSplits[i] = new FetchInputFormatSplit(splits[i], inputFormat);
        }
        if (work.getSplitSample() != null) {
            inputSplits = splitSampling(work.getSplitSample(), inputSplits);
        }
        if (inputSplits.length > 0) {
            if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_IN_TEST)) {
                Arrays.sort(inputSplits, new FetchInputFormatSplitComparator());
            }
            return inputSplits;
        }
    }
    return null;
}
Also used : InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit)
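
getNextSplits() only produces splits; consuming them follows the standard mapred pattern of opening one RecordReader per split. FetchOperator's own reader wiring is more involved, so the following is a generic sketch of that pattern using plain Hadoop APIs (SplitConsumer and countRecords are illustrative names):

import java.io.IOException;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class SplitConsumer {

    // Count the records in each split using the classic mapred API.
    public static <K, V> long countRecords(InputFormat<K, V> inputFormat,
            InputSplit[] splits, JobConf job) throws IOException {
        long total = 0;
        for (InputSplit split : splits) {
            RecordReader<K, V> reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
            try {
                K key = reader.createKey();
                V value = reader.createValue();
                // next() fills key/value and returns false once the split is exhausted.
                while (reader.next(key, value)) {
                    total++;
                }
            } finally {
                reader.close();
            }
        }
        return total;
    }
}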

Example 58 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class SplitGrouper, method group.

/**
 * Group splits for each bucket separately, while evenly filling all the
 * available slots with tasks.
 */
public Multimap<Integer, InputSplit> group(Configuration conf, Multimap<Integer, InputSplit> bucketSplitMultimap, int availableSlots, float waves, SplitLocationProvider splitLocationProvider) throws IOException {
    // figure out how many tasks we want for each bucket
    Map<Integer, Integer> bucketTaskMap = estimateBucketSizes(availableSlots, waves, bucketSplitMultimap.asMap());
    // allocate map bucket id to grouped splits
    Multimap<Integer, InputSplit> bucketGroupedSplitMultimap = ArrayListMultimap.<Integer, InputSplit>create();
    // use the tez grouper to combine splits once per bucket
    for (int bucketId : bucketSplitMultimap.keySet()) {
        Collection<InputSplit> inputSplitCollection = bucketSplitMultimap.get(bucketId);
        InputSplit[] rawSplits = inputSplitCollection.toArray(new InputSplit[0]);
        InputSplit[] groupedSplits = tezGrouper.getGroupedSplits(conf, rawSplits, bucketTaskMap.get(bucketId), HiveInputFormat.class.getName(), new ColumnarSplitSizeEstimator(), splitLocationProvider);
        LOG.info("Original split count is " + rawSplits.length + " grouped split count is " + groupedSplits.length + ", for bucket: " + bucketId);
        for (InputSplit inSplit : groupedSplits) {
            bucketGroupedSplitMultimap.put(bucketId, inSplit);
        }
    }
    return bucketGroupedSplitMultimap;
}
Also used : HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) InputSplit(org.apache.hadoop.mapred.InputSplit) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint)
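
The per-bucket task counts come from estimateBucketSizes(), which is not shown in this example. As a rough illustration of a wave-based estimate (a hypothetical simplification, not Hive's actual implementation), the slot count scaled by the wave factor can be shared across buckets in proportion to their split counts:

import java.util.HashMap;
import java.util.Map;

public class WaveEstimate {

    // Hypothetical sketch: distribute (availableSlots * waves) tasks across
    // buckets proportionally to each bucket's split count, at least one each.
    public static Map<Integer, Integer> estimateBucketTasks(
            Map<Integer, Integer> splitCountPerBucket, int availableSlots, float waves) {
        Map<Integer, Integer> tasksPerBucket = new HashMap<>();
        int totalSplits = 0;
        for (int count : splitCountPerBucket.values()) {
            totalSplits += count;
        }
        if (totalSplits == 0) {
            return tasksPerBucket;
        }
        int totalTasks = Math.max(1, (int) (availableSlots * waves));
        for (Map.Entry<Integer, Integer> e : splitCountPerBucket.entrySet()) {
            int share = (int) Math.ceil((double) totalTasks * e.getValue() / totalSplits);
            tasksPerBucket.put(e.getKey(), Math.max(1, share));
        }
        return tasksPerBucket;
    }
}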

Example 59 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class CustomPartitionVertex, method processAllEvents.

private void processAllEvents(String inputName, Multimap<Integer, InputSplit> bucketToGroupedSplitMap, boolean secondLevelGroupingDone) throws IOException {
    int totalInputsCount = 0;
    List<Integer> numSplitsForTask = new ArrayList<Integer>();
    for (Entry<Integer, Collection<InputSplit>> entry : bucketToGroupedSplitMap.asMap().entrySet()) {
        int bucketNum = entry.getKey();
        Collection<InputSplit> initialSplits = entry.getValue();
        finalSplits.addAll(initialSplits);
        for (InputSplit inputSplit : initialSplits) {
            bucketToTaskMap.put(bucketNum, taskCount);
            if (secondLevelGroupingDone) {
                TezGroupedSplit groupedSplit = (TezGroupedSplit) inputSplit;
                numSplitsForTask.add(groupedSplit.getGroupedSplits().size());
                totalInputsCount += groupedSplit.getGroupedSplits().size();
            } else {
                numSplitsForTask.add(1);
                totalInputsCount += 1;
            }
            taskCount++;
        }
    }
    inputNameInputSpecMap.put(inputName, InputSpecUpdate.createPerTaskInputSpecUpdate(numSplitsForTask));
    // Construct the EdgeManager descriptor to be used by all edges which need
    // the routing table.
    EdgeManagerPluginDescriptor hiveEdgeManagerDesc = null;
    if ((vertexType == VertexType.MULTI_INPUT_INITIALIZED_EDGES) || (vertexType == VertexType.INITIALIZED_EDGES)) {
        hiveEdgeManagerDesc = EdgeManagerPluginDescriptor.create(CustomPartitionEdge.class.getName());
        UserPayload payload = getBytePayload(bucketToTaskMap);
        hiveEdgeManagerDesc.setUserPayload(payload);
    }
    // Replace the edge manager for all vertices which have routing type custom.
    for (Entry<String, EdgeProperty> edgeEntry : context.getInputVertexEdgeProperties().entrySet()) {
        if (edgeEntry.getValue().getDataMovementType() == DataMovementType.CUSTOM && edgeEntry.getValue().getEdgeManagerDescriptor().getClassName().equals(CustomPartitionEdge.class.getName())) {
            emMap.put(edgeEntry.getKey(), hiveEdgeManagerDesc);
        }
    }
    LOG.info("Task count is " + taskCount + " for input name: " + inputName);
    List<InputDataInformationEvent> taskEvents = Lists.newArrayListWithCapacity(totalInputsCount);
    // Re-serialize the splits after grouping.
    int count = 0;
    for (InputSplit inputSplit : finalSplits) {
        if (secondLevelGroupingDone) {
            TezGroupedSplit tezGroupedSplit = (TezGroupedSplit) inputSplit;
            for (InputSplit subSplit : tezGroupedSplit.getGroupedSplits()) {
                if (!(subSplit instanceof TezGroupedSplit)) {
                    throw new IOException("Unexpected split type found: " + subSplit.getClass().getCanonicalName());
                }
                MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(subSplit);
                InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
                diEvent.setTargetIndex(count);
                taskEvents.add(diEvent);
            }
        } else {
            MRSplitProto serializedSplit = MRInputHelpers.createSplitProto(inputSplit);
            InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(count, serializedSplit.toByteString().asReadOnlyByteBuffer());
            diEvent.setTargetIndex(count);
            taskEvents.add(diEvent);
        }
        count++;
    }
    // Set the actual events for the tasks.
    LOG.info("For input name: " + inputName + " task events size is " + taskEvents.size());
    context.addRootInputEvents(inputName, taskEvents);
    if (!inputToGroupedSplitMap.isEmpty()) {
        for (Entry<String, Multimap<Integer, InputSplit>> entry : inputToGroupedSplitMap.entrySet()) {
            processAllSideEvents(entry.getKey(), entry.getValue());
        }
        setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
        inputToGroupedSplitMap.clear();
    }
    // Only done when it is a bucket map join alone, with no SMB join.
    if (numInputsAffectingRootInputSpecUpdate == 1) {
        setVertexParallelismAndRootInputSpec(inputNameInputSpecMap);
        // Send the bucket IDs associated with the tasks, must happen after parallelism is set.
        sendBucketIdsToProcessor();
    }
}
Also used : UserPayload(org.apache.tez.dag.api.UserPayload) TezGroupedSplit(org.apache.hadoop.mapred.split.TezGroupedSplit) ByteString(com.google.protobuf.ByteString) IOException(java.io.IOException) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint) ArrayListMultimap(com.google.common.collect.ArrayListMultimap) Multimap(com.google.common.collect.Multimap) HashMultimap(com.google.common.collect.HashMultimap) LinkedListMultimap(com.google.common.collect.LinkedListMultimap) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) InputSplit(org.apache.hadoop.mapred.InputSplit) MRSplitProto(org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto)
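
The first loop above walks a bucket-to-splits Multimap and records, per task, how many raw splits that task will read: the size of its TezGroupedSplit when second-level grouping ran, otherwise exactly one. The same bookkeeping in isolation, on plain Guava types (a standalone illustration, not vertex code; each multimap value stands in for one grouped split, represented by its sub-split count):

import java.util.ArrayList;
import java.util.List;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

public class SplitBookkeeping {

    // Each multimap value is one task's grouped split, represented here by
    // the number of raw splits it contains.
    public static List<Integer> splitsPerTask(Multimap<Integer, Integer> bucketToGroupSizes,
            boolean secondLevelGroupingDone) {
        List<Integer> numSplitsForTask = new ArrayList<>();
        for (Integer groupSize : bucketToGroupSizes.values()) {
            // Grouped: the task reads every sub-split in its group; ungrouped: one.
            numSplitsForTask.add(secondLevelGroupingDone ? groupSize : 1);
        }
        return numSplitsForTask;
    }

    public static void main(String[] args) {
        Multimap<Integer, Integer> m = ArrayListMultimap.create();
        m.put(0, 3); // bucket 0: one grouped split holding 3 raw splits
        m.put(1, 2); // bucket 1: one grouped split holding 2 raw splits
        System.out.println(splitsPerTask(m, true)); // [3, 2]
        System.out.println(splitsPerTask(m, false)); // [1, 1]
    }
}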

Example 60 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

From the class LlapBaseInputFormat, method getSplits.

/**
 * Calling getSplits() will open a HiveServer2 connection, which should be closed by the calling
 * application using LlapBaseInputFormat.close() when it is done with the splits.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    List<InputSplit> ins = new ArrayList<InputSplit>();
    if (url == null)
        url = job.get(URL_KEY);
    if (query == null)
        query = job.get(QUERY_KEY);
    if (user == null)
        user = job.get(USER_KEY);
    if (pwd == null)
        pwd = job.get(PWD_KEY);
    String database = job.get(DB_KEY);
    if (url == null || query == null) {
        throw new IllegalStateException("Connection URL and query must both be set");
    }
    String handleId = job.get(HANDLE_ID);
    if (handleId == null) {
        handleId = UUID.randomUUID().toString();
        LOG.info("Handle ID not specified - generated handle ID {}", handleId);
    }
    try {
        Class.forName(driverName);
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
    LOG.info("Handle ID {}: query={}", handleId, query);
    String escapedQuery = StringUtils.escapeString(query, ESCAPE_CHAR, escapedChars);
    String sql = String.format(SPLIT_QUERY, escapedQuery, numSplits);
    try {
        Connection conn = DriverManager.getConnection(url, user, pwd);
        try (Statement stmt = conn.createStatement()) {
            if (database != null && !database.isEmpty()) {
                stmt.execute("USE " + database);
            }
            ResultSet res = stmt.executeQuery(sql);
            while (res.next()) {
                // deserialize split
                DataInput in = new DataInputStream(res.getBinaryStream(1));
                InputSplitWithLocationInfo is = new LlapInputSplit();
                is.readFields(in);
                ins.add(is);
            }
            res.close();
        } catch (Exception e) {
            LOG.error("Closing connection due to error", e);
            conn.close();
            throw e;
        }
        // Keep connection open to hang on to associated resources (temp tables, locks).
        // Save to connectionMap so it can be closed at user's convenience.
        addConnection(handleId, conn);
    } catch (Exception e) {
        throw new IOException(e);
    }
    return ins.toArray(new InputSplit[ins.size()]);
}
Also used : InputSplitWithLocationInfo(org.apache.hadoop.mapred.InputSplitWithLocationInfo) Statement(java.sql.Statement) ArrayList(java.util.ArrayList) Connection(java.sql.Connection) ByteString(com.google.protobuf.ByteString) IOException(java.io.IOException) DataInputStream(java.io.DataInputStream) DataInput(java.io.DataInput) ResultSet(java.sql.ResultSet) InputSplit(org.apache.hadoop.mapred.InputSplit)
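
As the javadoc warns, each getSplits() call leaves a HiveServer2 connection open under its handle ID. A sketch of the intended calling pattern, assuming the URL_KEY, QUERY_KEY, USER_KEY, PWD_KEY, and HANDLE_ID constants referenced above are publicly accessible on LlapBaseInputFormat and that close(handleId) releases the saved connection (the connection details here are placeholders):

import org.apache.hadoop.hive.llap.LlapBaseInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public class LlapSplitsExample {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // Connection parameters read back by getSplits() above.
        job.set(LlapBaseInputFormat.URL_KEY, "jdbc:hive2://localhost:10000/default");
        job.set(LlapBaseInputFormat.QUERY_KEY, "SELECT * FROM src");
        job.set(LlapBaseInputFormat.USER_KEY, "hive");
        job.set(LlapBaseInputFormat.PWD_KEY, "");
        job.set(LlapBaseInputFormat.HANDLE_ID, "example-handle");

        LlapBaseInputFormat inputFormat = new LlapBaseInputFormat();
        try {
            InputSplit[] splits = inputFormat.getSplits(job, 4);
            // ... hand each split to a record reader here ...
        } finally {
            // Releases the HiveServer2 connection saved under the handle ID.
            LlapBaseInputFormat.close("example-handle");
        }
    }
}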

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8