Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class CustomPartitionVertex, method onRootVertexInitialized:
// One call per root Input
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
  numInputsSeenSoFar++;
  LOG.info("On root vertex initialized " + inputName);
  try {
    // This is using the payload from the RootVertexInitializer corresponding
    // to InputName. Ideally it should be using its own configuration class -
    // but that means serializing another instance.
    MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
    this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
    /*
     * Currently in tez, the flow of events is thus:
     * "Generate Splits -> Initialize Vertex" (with parallelism info obtained
     * from the generate splits phase). The generate splits phase groups
     * splits using the TezGroupedSplitsInputFormat. However, for bucket map
     * joins the grouping done by this input format results in incorrect
     * results as the grouper has no knowledge of buckets. So, we initially
     * set the input format to be HiveInputFormat (in DagUtils) for the case
     * of bucket map joins so as to obtain un-grouped splits. We then group
     * the splits corresponding to buckets using the tez grouper which returns
     * TezGroupedSplits.
     */
    // This assumes that Grouping will always be used.
    // Enabling grouping on the payload.
    MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
    inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  boolean dataInformationEventSeen = false;
  Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
  for (Event event : events) {
    if (event instanceof InputConfigureVertexTasksEvent) {
      // No tasks should have been started yet. Checked by initial state check.
      LOG.info("Got a input configure vertex event for input: " + inputName);
      Preconditions.checkState(dataInformationEventSeen == false);
      InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
      // The vertex cannot be configured until all DataEvents are seen - to
      // build the routing table.
      configureVertexTaskEvent = cEvent;
      LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
    }
    if (event instanceof InputUpdatePayloadEvent) {
      // This event can never occur. If it does, fail.
      Preconditions.checkState(false);
    } else if (event instanceof InputDataInformationEvent) {
      dataInformationEventSeen = true;
      InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
      FileSplit fileSplit;
      try {
        fileSplit = getFileSplitFromEvent(diEvent);
      } catch (IOException e) {
        throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
      }
      Set<FileSplit> fsList = pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
      if (fsList == null) {
        fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
        pathFileSplitsMap.put(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
      }
      fsList.add(fileSplit);
    }
  }
  LOG.debug("Path file splits map for input name: {} is {}", inputName, pathFileSplitsMap);
  Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(inputName, pathFileSplitsMap);
  try {
    int totalResource = context.getTotalAvailableResource().getMemory();
    int taskResource = context.getVertexTaskResource().getMemory();
    float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
    int availableSlots = totalResource / taskResource;
    LOG.debug("Grouping splits. {} available slots, {} waves. Bucket initial splits map: {}", availableSlots, waves, bucketToInitialSplitMap);
    JobConf jobConf = new JobConf(conf);
    ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
    Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
    boolean secondLevelGroupingDone = false;
    if ((mainWorkName.isEmpty()) || (inputName.compareTo(mainWorkName) == 0)) {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
        if (mainWorkName.isEmpty() == false) {
          Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
          singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
          groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots, HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
          secondLevelGroupingDone = true;
        }
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
    } else {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      // Group all the bucket files.
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      /*
       * This is the small table side. In case of SMB join, we need to send each split to the
       * corresponding bucket-based task on the other side. In case a split needs to go to
       * multiple downstream tasks, we need to clone the event and send it to the right
       * destination.
       */
      LOG.info("This is the side work - multi-mr work.");
      processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
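The core of this method is grouping the incoming FileSplits by the bucket file name embedded in each split's path before handing them to the Tez grouper. A minimal, self-contained sketch of that first grouping step is below; bucketNameOf is a crude hypothetical stand-in for Utilities.getBucketFileNameFromPathSubString, and the comparator only approximates the role of PathComparatorForSplit.

import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.hadoop.mapred.FileSplit;

public class BucketSplitGrouping {
  // Crude stand-in for Utilities.getBucketFileNameFromPathSubString: treat the
  // part of the file name before the first '_' as the bucket identifier.
  static String bucketNameOf(String fileName) {
    int idx = fileName.indexOf('_');
    return idx > 0 ? fileName.substring(0, idx) : fileName;
  }

  public static Map<String, Set<FileSplit>> groupByBucketFile(List<FileSplit> splits) {
    // TreeMap keeps bucket names in a stable order; the TreeSet orders splits of
    // the same bucket file by path and start offset, mirroring the role of
    // PathComparatorForSplit in CustomPartitionVertex.
    Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<>();
    for (FileSplit split : splits) {
      String bucket = bucketNameOf(split.getPath().getName());
      pathFileSplitsMap
          .computeIfAbsent(bucket, b -> new TreeSet<>(
              Comparator.comparing((FileSplit s) -> s.getPath().toString())
                  .thenComparingLong(FileSplit::getStart)))
          .add(split);
    }
    return pathFileSplitsMap;
  }
}

Each per-bucket set then feeds getBucketSplitMapForPath and the Tez grouper, which is where the real class builds the bucket-to-grouped-split multimap.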
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class HiveSplitGenerator, method initialize:
@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
  if (getContext() != null) {
    // called from Tez AM.
    prepare(getContext());
  }
  // Setup the map work for this thread. Pruning modified the work instance to potentially remove
  // partitions. The same work instance must be used when generating splits.
  Utilities.setMapWork(jobConf, work);
  try {
    boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
    // perform dynamic partition pruning
    if (pruner != null) {
      pruner.initialize(getContext(), work, jobConf);
      pruner.prune();
    }
    InputSplitInfoMem inputSplitInfo = null;
    boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
    LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
    String realInputFormatName = conf.get("mapred.input.format.class");
    boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
    if (groupingEnabled) {
      // Need to instantiate the realInputFormat
      InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
      int totalResource = 0;
      int taskResource = 0;
      int availableSlots = 0;
      // FIXME. Do the right thing Luke.
      if (getContext() == null) {
        // for now, totalResource = taskResource for llap
        availableSlots = 1;
      }
      if (getContext() != null) {
        totalResource = getContext().getTotalAvailableResource().getMemory();
        taskResource = getContext().getVertexTaskResource().getMemory();
        availableSlots = totalResource / taskResource;
      }
      if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
        // broken configuration from mapred-default.xml
        final long blockSize = conf.getLongBytes(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
        final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
        final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
        HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
        LOG.info("The preferred split size is " + preferredSplitSize);
      }
      float waves;
      // Create the un-grouped splits
      if (numSplits.isPresent()) {
        waves = numSplits.get().floatValue() / availableSlots;
      } else {
        waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
      }
      InputSplit[] splits;
      if (generateSingleSplit && conf.get(HiveConf.ConfVars.HIVETEZINPUTFORMAT.varname).equals(HiveInputFormat.class.getName())) {
        MapWork mapWork = Utilities.getMapWork(jobConf);
        List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
        FileSystem fs = paths.get(0).getFileSystem(jobConf);
        FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
        if (fileStatuses.length == 0) {
          // Generating a single split typically happens when reading data out of ORDER BY queries.
          // If the ORDER BY query returns no rows, no files will exist in the input path.
          splits = new InputSplit[0];
        } else {
          // If files exist in the input path there has to be exactly one, as this code path is
          // triggered only for ORDER BY queries, which are expected to write a single file
          // (written by one reducer).
          Preconditions.checkState(paths.size() == 1 && fileStatuses.length == 1 && mapWork.getAliasToPartnInfo().size() == 1, "Requested to generate single split. Paths and fileStatuses are expected to be 1. " + "Got paths: " + paths.size() + " fileStatuses: " + fileStatuses.length);
          splits = new InputSplit[1];
          FileStatus fileStatus = fileStatuses[0];
          BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
          Set<String> hostsSet = new HashSet<>();
          for (BlockLocation location : locations) {
            hostsSet.addAll(Lists.newArrayList(location.getHosts()));
          }
          String[] hosts = hostsSet.toArray(new String[0]);
          FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
          String alias = mapWork.getAliases().get(0);
          PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
          String partIF = partDesc.getInputFileFormatClassName();
          splits[0] = new HiveInputFormat.HiveInputSplit(fileSplit, partIF);
        }
      } else {
        // Raw splits
        splits = inputFormat.getSplits(jobConf, numSplits.orElse(Math.multiplyExact(availableSlots, (int) waves)));
      }
      // Sort the splits, so that subsequent grouping is consistent.
      Arrays.sort(splits, new InputSplitComparator());
      LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
      // increment/set input counters
      InputInitializerContext inputInitializerContext = getContext();
      TezCounters tezCounters = null;
      String counterName;
      String groupName = null;
      String vertexName = null;
      if (inputInitializerContext != null) {
        try {
          tezCounters = new TezCounters();
          groupName = HiveInputCounters.class.getName();
          vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
          counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).increment(splits.length);
          final List<Path> paths = Utilities.getInputPathsTez(jobConf, work);
          counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_DIRECTORIES.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).increment(paths.size());
          final Set<String> files = new HashSet<>();
          for (InputSplit inputSplit : splits) {
            if (inputSplit instanceof FileSplit) {
              final FileSplit fileSplit = (FileSplit) inputSplit;
              final Path path = fileSplit.getPath();
              // The assumption here is that the path is a file. The only case where this differs is ACID deltas.
              // The isFile check is avoided here for performance reasons.
              final String fileStr = path.toString();
              if (!files.contains(fileStr)) {
                files.add(fileStr);
              }
            }
          }
          counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_FILES.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).increment(files.size());
        } catch (Exception e) {
          LOG.warn("Caught exception while trying to update Tez counters", e);
        }
      }
      if (work.getIncludedBuckets() != null) {
        splits = pruneBuckets(work, splits);
      }
      Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
      // And finally return them in a flat array
      InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
      LOG.info("Number of split groups: " + flatSplits.length);
      if (inputInitializerContext != null) {
        try {
          counterName = Utilities.getVertexCounterName(HiveInputCounters.GROUPED_INPUT_SPLITS.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).setValue(flatSplits.length);
          LOG.debug("Published tez counters: {}", tezCounters);
          inputInitializerContext.addCounters(tezCounters);
        } catch (Exception e) {
          LOG.warn("Caught exception while trying to update Tez counters", e);
        }
      }
      List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
      inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
    } else {
      // If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
      throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
      // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
    }
    return createEventList(sendSerializedEvents, inputSplitInfo);
  } finally {
    Utilities.clearWork(jobConf);
  }
}
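A recurring piece of arithmetic in this initializer is turning cluster capacity into a target split count: availableSlots is the vertex's total memory divided by per-task memory, and the desired number of pre-grouping splits is roughly availableSlots * waves. A small sketch of that calculation follows; the class and parameter names are illustrative, not part of HiveSplitGenerator.

public final class SplitCountEstimator {
  /**
   * Rough sketch of the slot/wave arithmetic shown above. HiveSplitGenerator
   * obtains the resource figures from the InputInitializerContext and the wave
   * factor from the TezMapReduceSplitsGrouper configuration.
   */
  public static int estimateTargetSplits(int totalResourceMb, int taskResourceMb, float waves) {
    if (taskResourceMb <= 0) {
      // No per-task resource information (compare the LLAP-style fallback in
      // the snippet above): assume a single slot.
      return Math.max(1, Math.round(waves));
    }
    int availableSlots = totalResourceMb / taskResourceMb;
    // One wave fills every slot once; a wave factor above 1 slightly
    // over-partitions so that faster tasks can pick up extra work.
    return Math.max(1, Math.round(availableSlots * waves));
  }

  public static void main(String[] args) {
    // e.g. 64 GB of vertex resource, 4 GB per task, 1.7 waves -> 27 splits
    System.out.println(estimateTargetSplits(65536, 4096, 1.7f));
  }
}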
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class HostAffinitySplitLocationProvider, method getLocations:
@Override
public String[] getLocations(InputSplit split) throws IOException {
  if (!(split instanceof FileSplit)) {
    LOG.debug("Split: {} is not a FileSplit. Using default locations", split);
    return split.getLocations();
  }
  FileSplit fsplit = (FileSplit) split;
  String location = locations.get(determineLocation(locations, fsplit));
  return (location != null) ? new String[] { location } : null;
}
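getLocations only narrows a FileSplit down to one preferred host; the actual choice is delegated to determineLocation against a fixed list of known locations. The sketch below illustrates the general pattern of deriving a stable host from a split by hashing its path and start offset; it is not the real determineLocation logic, only the general idea.

import java.util.List;
import org.apache.hadoop.mapred.FileSplit;

public final class SimpleSplitAffinity {
  /**
   * Illustrative only: map a FileSplit to a stable index into a host list so the
   * same split is always routed to the same host, which is the consistency
   * property a host-affinity location provider relies on.
   */
  public static String pickHost(List<String> hosts, FileSplit split) {
    int hash = (split.getPath().toString() + ":" + split.getStart()).hashCode();
    return hosts.get(Math.floorMod(hash, hosts.size()));
  }
}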
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class Base64TextInputFormat, method getRecordReader:
public RecordReader<LongWritable, BytesWritable> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());
  Base64LineRecordReader reader = new Base64LineRecordReader(new LineRecordReader(job, (FileSplit) genericSplit));
  reader.configure(job);
  return reader;
}
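The reader returned here is a thin decorator: LineRecordReader produces raw lines and Base64LineRecordReader decodes each one into bytes. A hedged sketch of that wrapping pattern is below; DecodingRecordReader is a hypothetical name, and the decoding is simplified to plain java.util.Base64, whereas Hive's Base64LineRecordReader also validates an optional signature prefix.

import java.io.IOException;
import java.util.Base64;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;

/** Hypothetical sketch of wrapping a line-oriented reader with base64 decoding. */
public class DecodingRecordReader implements RecordReader<LongWritable, BytesWritable> {

  private final RecordReader<LongWritable, Text> lineReader;
  private final Text line = new Text();

  public DecodingRecordReader(RecordReader<LongWritable, Text> lineReader) {
    this.lineReader = lineReader;
  }

  @Override
  public boolean next(LongWritable key, BytesWritable value) throws IOException {
    if (!lineReader.next(key, line)) {
      return false;
    }
    // Decode the raw line into the reusable BytesWritable buffer.
    byte[] decoded = Base64.getDecoder().decode(line.toString().trim());
    value.set(decoded, 0, decoded.length);
    return true;
  }

  @Override
  public LongWritable createKey() {
    return lineReader.createKey();
  }

  @Override
  public BytesWritable createValue() {
    return new BytesWritable();
  }

  @Override
  public long getPos() throws IOException {
    return lineReader.getPos();
  }

  @Override
  public float getProgress() throws IOException {
    return lineReader.getProgress();
  }

  @Override
  public void close() throws IOException {
    lineReader.close();
  }
}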
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class RCFileCat, method run:
@Override
public int run(String[] args) throws Exception {
  long start = 0L;
  long length = -1L;
  int recordCount = 0;
  long startT = System.currentTimeMillis();
  boolean verbose = false;
  boolean columnSizes = false;
  boolean pretty = false;
  boolean fileSizes = false;
  // get options from arguments
  if (args.length < 1 || args.length > 3) {
    printUsage(null);
    return -1;
  }
  Path fileName = null;
  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    if (arg.startsWith("--start=")) {
      start = Long.parseLong(arg.substring("--start=".length()));
    } else if (arg.startsWith("--length=")) {
      length = Long.parseLong(arg.substring("--length=".length()));
    } else if (arg.equals("--verbose")) {
      verbose = true;
    } else if (arg.equals("--column-sizes")) {
      columnSizes = true;
    } else if (arg.equals("--column-sizes-pretty")) {
      columnSizes = true;
      pretty = true;
    } else if (arg.equals("--file-sizes")) {
      fileSizes = true;
    } else if (fileName == null) {
      fileName = new Path(arg);
    } else {
      printUsage(null);
      return -1;
    }
  }
  setupBufferedOutput();
  FileSystem fs = FileSystem.get(fileName.toUri(), conf);
  long fileLen = fs.getFileStatus(fileName).getLen();
  if (start < 0) {
    start = 0;
  }
  if (start > fileLen) {
    return 0;
  }
  if (length < 0 || (start + length) > fileLen) {
    length = fileLen - start;
  }
  // share the code with RecordReader.
  FileSplit split = new FileSplit(fileName, start, length, new JobConf(conf));
  RCFileRecordReader recordReader = new RCFileRecordReader(conf, split);
  if (columnSizes || fileSizes) {
    // Print out the un/compressed sizes of each column
    long[] compressedColumnSizes = null;
    long[] uncompressedColumnSizes = null;
    // un/compressed sizes of file and no. of rows
    long rowNo = 0;
    long uncompressedFileSize = 0;
    long compressedFileSize = 0;
    // Skip from block to block since we only need the header
    while (recordReader.nextBlock()) {
      // Get the sizes from the key buffer and aggregate
      KeyBuffer keyBuffer = recordReader.getKeyBuffer();
      if (uncompressedColumnSizes == null) {
        uncompressedColumnSizes = new long[keyBuffer.getColumnNumber()];
      }
      if (compressedColumnSizes == null) {
        compressedColumnSizes = new long[keyBuffer.getColumnNumber()];
      }
      for (int i = 0; i < keyBuffer.getColumnNumber(); i++) {
        uncompressedColumnSizes[i] += keyBuffer.getEachColumnUncompressedValueLen()[i];
        compressedColumnSizes[i] += keyBuffer.getEachColumnValueLen()[i];
      }
      rowNo += keyBuffer.getNumberRows();
    }
    if (columnSizes && uncompressedColumnSizes != null && compressedColumnSizes != null) {
      // otherwise print it out as if it were a row
      for (int i = 0; i < uncompressedColumnSizes.length; i++) {
        if (pretty) {
          System.out.println("Column " + i + ": Uncompressed size: " + uncompressedColumnSizes[i] + " Compressed size: " + compressedColumnSizes[i]);
        } else {
          System.out.print(i + TAB + uncompressedColumnSizes[i] + TAB + compressedColumnSizes[i] + NEWLINE);
        }
      }
    }
    if (fileSizes) {
      if (uncompressedColumnSizes != null && compressedColumnSizes != null) {
        for (int i = 0; i < uncompressedColumnSizes.length; i++) {
          uncompressedFileSize += uncompressedColumnSizes[i];
          compressedFileSize += compressedColumnSizes[i];
        }
      }
      System.out.print("File size (uncompressed): " + uncompressedFileSize + ". File size (compressed): " + compressedFileSize + ". Number of rows: " + rowNo + "." + NEWLINE);
    }
    System.out.flush();
    return 0;
  }
  LongWritable key = new LongWritable();
  BytesRefArrayWritable value = new BytesRefArrayWritable();
  // extra capacity in case we overrun, to avoid resizing
  StringBuilder buf = new StringBuilder(STRING_BUFFER_SIZE);
  while (recordReader.next(key, value)) {
    printRecord(value, buf);
    recordCount++;
    if (verbose && (recordCount % RECORD_PRINT_INTERVAL) == 0) {
      long now = System.currentTimeMillis();
      System.err.println("Read " + recordCount / 1024 + "k records");
      System.err.println("Read " + ((recordReader.getPos() / (1024L * 1024L))) + "MB");
      System.err.printf("Input scan rate %.2f MB/s\n", (recordReader.getPos() * 1.0 / (now - startT)) / 1024.0);
    }
    if (buf.length() > STRING_BUFFER_FLUSH_SIZE) {
      System.out.print(buf.toString());
      buf.setLength(0);
    }
  }
  // print out last part of buffer
  System.out.print(buf.toString());
  System.out.flush();
  return 0;
}
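Before constructing the FileSplit, run clamps the requested start and length to the actual file extent, then reads only that byte range. That range handling is the reusable part; a small sketch follows, where makeClampedSplit is a hypothetical helper rather than anything in RCFileCat.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;

public final class SplitRanges {
  /**
   * Hypothetical helper mirroring the range handling in RCFileCat.run: clamp
   * (start, length) to the file's real size and build a FileSplit covering that
   * range. Returns null when start lies beyond the end of the file.
   */
  public static FileSplit makeClampedSplit(Configuration conf, Path file, long start, long length)
      throws IOException {
    FileSystem fs = FileSystem.get(file.toUri(), conf);
    long fileLen = fs.getFileStatus(file).getLen();
    if (start < 0) {
      start = 0;
    }
    if (start > fileLen) {
      return null;
    }
    if (length < 0 || start + length > fileLen) {
      length = fileLen - start;
    }
    // Same JobConf-based constructor as the call in RCFileCat; it builds the
    // split without precomputed block locations.
    return new FileSplit(file, start, length, new JobConf(conf));
  }
}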