Examples with InputSplit - org.apache.hadoop.mapred.InputSplit

Example 46 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class HiveAccumuloTableInputFormat method getSplits.

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    final AccumuloConnectionParameters accumuloParams = new AccumuloConnectionParameters(jobConf);
    final Instance instance = accumuloParams.getInstance();
    final ColumnMapper columnMapper;
    try {
        columnMapper = getColumnMapper(jobConf);
    } catch (TooManyAccumuloColumnsException e) {
        throw new IOException(e);
    }
    JobContext context = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(jobConf));
    Path[] tablePaths = FileInputFormat.getInputPaths(context);
    try {
        Connector connector = null;
        // Need to get a Connector so we look up the user's authorizations if not otherwise specified
        if (accumuloParams.useSasl()) {
            log.info("Current user: " + UserGroupInformation.getCurrentUser());
            // In a YARN/Tez job, don't have the Kerberos credentials anymore, use the delegation token
            AuthenticationToken token = ConfiguratorBase.getAuthenticationToken(AccumuloInputFormat.class, jobConf);
            if (null != token && !jobConf.getCredentials().getAllTokens().isEmpty()) {
                // Convert the stub from the configuration back into a normal Token
                log.info("Found authentication token in Configuration: " + token);
                log.info("Job credential tokens: " + jobConf.getCredentials().getAllTokens());
                AuthenticationToken unwrappedToken = ConfiguratorBase.unwrapAuthenticationToken(jobConf, token);
                log.info("Converted authentication token from Configuration into: " + unwrappedToken);
                // will return back the original token (which we know is insufficient)
                if (unwrappedToken != token) {
                    log.info("Creating Accumulo Connector with unwrapped delegation token");
                    connector = instance.getConnector(accumuloParams.getAccumuloUserName(), unwrappedToken);
                } else {
                    log.info("Job credentials did not contain delegation token, fetching new token");
                }
            }
            if (connector == null) {
                log.info("Obtaining Accumulo Connector using KerberosToken");
                // Construct a KerberosToken -- relies on ProxyUser configuration. Will be the client making
                // the request on top of the HS2's user. Accumulo will require proper proxy-user auth configs.
                connector = instance.getConnector(accumuloParams.getAccumuloUserName(), new KerberosToken(accumuloParams.getAccumuloUserName()));
            }
        } else {
            // Still in the local JVM, use the username+password or Kerberos credentials
            connector = accumuloParams.getConnector(instance);
        }
        final List<ColumnMapping> columnMappings = columnMapper.getColumnMappings();
        final List<IteratorSetting> iterators = predicateHandler.getIterators(jobConf, columnMapper);
        final Collection<Range> ranges = predicateHandler.getRanges(jobConf, columnMapper);
        // We don't want that.
        if (null != ranges && ranges.isEmpty()) {
            return new InputSplit[0];
        }
        // Set the relevant information in the Configuration for the AccumuloInputFormat
        configure(jobConf, instance, connector, accumuloParams, columnMapper, iterators, ranges);
        int numColumns = columnMappings.size();
        List<Integer> readColIds = ColumnProjectionUtils.getReadColumnIDs(jobConf);
        // Sanity check
        if (numColumns < readColIds.size())
            throw new IOException("Number of column mappings (" + numColumns + ")" + " numbers less than the hive table columns. (" + readColIds.size() + ")");
        // get splits from Accumulo
        InputSplit[] splits = accumuloInputFormat.getSplits(jobConf, numSplits);
        HiveAccumuloSplit[] hiveSplits = new HiveAccumuloSplit[splits.length];
        for (int i = 0; i < splits.length; i++) {
            RangeInputSplit ris = (RangeInputSplit) splits[i];
            ris.setLogLevel(Level.DEBUG);
            hiveSplits[i] = new HiveAccumuloSplit(ris, tablePaths[0]);
        }
        return hiveSplits;
    } catch (AccumuloException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    } catch (AccumuloSecurityException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    } catch (SerDeException e) {
        log.error("Could not configure AccumuloInputFormat", e);
        throw new IOException(StringUtils.stringifyException(e));
    }
}

Also used : Connector(org.apache.accumulo.core.client.Connector) AuthenticationToken(org.apache.accumulo.core.client.security.tokens.AuthenticationToken) MockInstance(org.apache.accumulo.core.client.mock.MockInstance) Instance(org.apache.accumulo.core.client.Instance) RangeInputSplit(org.apache.accumulo.core.client.mapred.RangeInputSplit) AccumuloSecurityException(org.apache.accumulo.core.client.AccumuloSecurityException) JobContext(org.apache.hadoop.mapreduce.JobContext) RangeInputSplit(org.apache.accumulo.core.client.mapred.RangeInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) HiveAccumuloMapColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloMapColumnMapping) ColumnMapping(org.apache.hadoop.hive.accumulo.columns.ColumnMapping) HiveAccumuloColumnMapping(org.apache.hadoop.hive.accumulo.columns.HiveAccumuloColumnMapping) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) Path(org.apache.hadoop.fs.Path) AccumuloException(org.apache.accumulo.core.client.AccumuloException) KerberosToken(org.apache.accumulo.core.client.security.tokens.KerberosToken) IOException(java.io.IOException) TooManyAccumuloColumnsException(org.apache.hadoop.hive.accumulo.serde.TooManyAccumuloColumnsException) Range(org.apache.accumulo.core.data.Range) IteratorSetting(org.apache.accumulo.core.client.IteratorSetting) AccumuloConnectionParameters(org.apache.hadoop.hive.accumulo.AccumuloConnectionParameters) ColumnMapper(org.apache.hadoop.hive.accumulo.columns.ColumnMapper)

Example 47 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class StreamingAssert method readRecords.

/**
 * TODO: this would be more flexible doing a SQL select statement rather than using InputFormat directly
 * see {@link org.apache.hive.hcatalog.streaming.TestStreaming#checkDataWritten2(Path, long, long, int, String, String...)}
 * @param numSplitsExpected
 * @return
 * @throws Exception
 */
List<Record> readRecords(int numSplitsExpected) throws Exception {
    if (currentDeltas.isEmpty()) {
        throw new AssertionError("No data");
    }
    InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionLocation.toString());
    job.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(table.getSd().getNumBuckets()));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    assertEquals(numSplitsExpected, splits.length);
    List<Record> records = new ArrayList<>();
    for (InputSplit is : splits) {
        final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat.getRecordReader(is, job, Reporter.NULL);
        NullWritable key = recordReader.createKey();
        OrcStruct value = recordReader.createValue();
        while (recordReader.next(key, value)) {
            RecordIdentifier recordIdentifier = recordReader.getRecordIdentifier();
            Record record = new Record(new RecordIdentifier(recordIdentifier.getWriteId(), recordIdentifier.getBucketProperty(), recordIdentifier.getRowId()), value.toString());
            System.out.println(record);
            records.add(record);
        }
        recordReader.close();
    }
    return records;
}

Also used : ArrayList(java.util.ArrayList) AcidRecordReader(org.apache.hadoop.hive.ql.io.AcidInputFormat.AcidRecordReader) NullWritable(org.apache.hadoop.io.NullWritable) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) OrcStruct(org.apache.hadoop.hive.ql.io.orc.OrcStruct) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 48 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class CombineHiveInputFormat method getCombineSplits.

/**
 * Create Hive splits based on CombineFileSplit.
 */
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
    init(job);
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
    InputSplit[] splits = null;
    if (combine == null) {
        splits = super.getSplits(job, numSplits);
        return splits;
    }
    if (combine.getInputPathsShim(job).length == 0) {
        throw new IOException("No input paths specified in job");
    }
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();
    LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
    for (Path path : paths) {
        if (lDrvStat != null && lDrvStat.isAborted()) {
            throw new IOException("Operation is Canceled. ");
        }
        PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
        TableDesc tableDesc = part.getTableDesc();
        if ((tableDesc != null) && tableDesc.isNonNative()) {
            return super.getSplits(job, numSplits);
        }
        // Use HiveInputFormat if any of the paths is not splittable
        Class inputFormatClass = part.getInputFileFormatClass();
        String inputFormatClassName = inputFormatClass.getName();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        String deserializerClassName = null;
        try {
            deserializerClassName = part.getDeserializer(job).getClass().getName();
        } catch (Exception e) {
        // ignore
        }
        FileSystem inpFs = path.getFileSystem(job);
        // don't combine if inputformat is a SymlinkTextInputFormat
        if (inputFormat instanceof SymlinkTextInputFormat) {
            splits = super.getSplits(job, numSplits);
            return splits;
        }
        Path filterPath = path;
        // Does a pool exist for this path already
        CombineFilter f = null;
        List<Operator<? extends OperatorDesc>> opList = null;
        if (!mrwork.isMapperCannotSpanPartns()) {
            // if mapper can span partitions, make sure a splits does not contain multiple
            // opList + inputFormatClassName + deserializerClassName combination
            // This is done using the Map of CombinePathInputFormat to PathFilter
            opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
            CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
            f = poolMap.get(combinePathInputFormat);
            if (f == null) {
                f = new CombineFilter(filterPath);
                LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
                combine.createPool(job, f);
                poolMap.put(combinePathInputFormat, f);
            } else {
                LOG.debug("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
                f.addPath(filterPath);
            }
        } else {
            // but won't cross multiple partitions if the user has asked so.
            if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
                // path is not directory
                filterPath = path.getParent();
                inpFiles.add(path);
                poolSet.add(filterPath);
            } else {
                inpDirs.add(path);
            }
        }
    }
    // Processing directories
    List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
    if (!mrwork.isMapperCannotSpanPartns()) {
        // mapper can span partitions
        // combine into as few as one split, subject to the PathFilters set
        // using combine.createPool.
        iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
        for (Path path : inpDirs) {
            processPaths(job, combine, iss, path);
        }
        if (inpFiles.size() > 0) {
            // Processing files
            for (Path filterPath : poolSet) {
                combine.createPool(job, new CombineFilter(filterPath));
            }
            processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
        }
    }
    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
        iss = sampleSplits(iss);
    }
    for (CombineFileSplit is : iss) {
        CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
        result.add(csplit);
    }
    LOG.info("number of splits " + result.size());
    return result.toArray(new InputSplit[result.size()]);
}

Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CombineFileSplit(org.apache.hadoop.mapred.lib.CombineFileSplit) CombineFileInputFormatShim(org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) LockedDriverState(org.apache.hadoop.hive.ql.Driver.LockedDriverState) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 49 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class HiveInputFormat method addSplitsForGroup.

/*
   * AddSplitsForGroup collects separate calls to setInputPaths into one where possible.
   * The reason for this is that this is faster on some InputFormats. E.g.: Orc will start
   * a threadpool to do the work and calling it multiple times unnecessarily will create a lot
   * of unnecessary thread pools.
   */
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf, InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits, TableDesc table, List<InputSplit> result) throws IOException {
    ValidWriteIdList validWriteIdList = AcidUtils.getTableValidWriteIdList(conf, table.getTableName());
    ValidWriteIdList validMmWriteIdList;
    if (AcidUtils.isInsertOnlyTable(table.getProperties())) {
        if (validWriteIdList == null) {
            throw new IOException("Insert-Only table: " + table.getTableName() + " is missing from the ValidWriteIdList config: " + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
        }
        validMmWriteIdList = validWriteIdList;
    } else {
        // for non-MM case
        validMmWriteIdList = null;
    }
    try {
        Utilities.copyTablePropertiesToConf(table, conf);
        if (tableScan != null) {
            AcidUtils.setAcidOperationalProperties(conf, tableScan.getConf().isTranscationalTable(), tableScan.getConf().getAcidOperationalProperties());
            if (tableScan.getConf().isTranscationalTable() && (validWriteIdList == null)) {
                throw new IOException("Acid table: " + table.getTableName() + " is missing from the ValidWriteIdList config: " + conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
            }
            if (validWriteIdList != null) {
                AcidUtils.setValidWriteIdList(conf, validWriteIdList);
            }
        }
    } catch (HiveException e) {
        throw new IOException(e);
    }
    if (tableScan != null) {
        pushFilters(conf, tableScan, this.mrwork);
    }
    Path[] finalDirs = processPathsForMmRead(dirs, conf, validMmWriteIdList);
    if (finalDirs == null) {
        // No valid inputs.
        return;
    }
    FileInputFormat.setInputPaths(conf, finalDirs);
    conf.setInputFormat(inputFormat.getClass());
    int headerCount = 0;
    int footerCount = 0;
    if (table != null) {
        headerCount = Utilities.getHeaderCount(table);
        footerCount = Utilities.getFooterCount(table, conf);
        if (headerCount != 0 || footerCount != 0) {
            // Input file has header or footer, cannot be splitted.
            HiveConf.setLongVar(conf, ConfVars.MAPREDMINSPLITSIZE, Long.MAX_VALUE);
        }
    }
    InputSplit[] iss = inputFormat.getSplits(conf, splits);
    for (InputSplit is : iss) {
        result.add(new HiveInputSplit(is, inputFormatClass.getName()));
    }
    if (iss.length == 0 && finalDirs.length > 0 && conf.getBoolean(Utilities.ENSURE_OPERATORS_EXECUTED, false)) {
        // If there are no inputs; the Execution engine skips the operator tree.
        // To prevent it from happening; an opaque  ZeroRows input is added here - when needed.
        result.add(new HiveInputSplit(new NullRowsInputFormat.DummyInputSplit(finalDirs[0].toString()), ZeroRowsInputFormat.class.getName()));
    }
}

Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) IOException(java.io.IOException) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 50 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class HiveInputFormat method getSplits.

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.GET_SPLITS);
    init(job);
    Path[] dirs = getInputPaths(job);
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();
    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;
    boolean pushDownProjection = false;
    // Buffers to hold filter pushdown information
    StringBuilder readColumnsBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ""));
    ;
    StringBuilder readColumnNamesBuffer = new StringBuilder(newjob.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;
        List<String> aliases = mrwork.getPathToAliases().get(dir);
        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // Reset buffers to store filter push down columns
                readColumnsBuffer.setLength(0);
                readColumnNamesBuffer.setLength(0);
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(readColumnsBuffer, readColumnNamesBuffer, tableScan.getNeededColumnIDs(), tableScan.getNeededColumns());
                pushDownProjection = true;
                // push down filters
                pushFilters(newjob, tableScan, this.mrwork);
            }
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("aliases: {} pathToAliases: {} dir: {}", aliases, mrwork.getPathToAliases(), dir);
            }
        }
        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass) && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }
        if (!currentDirs.isEmpty()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Generating splits as currentDirs is not empty. currentDirs: {}", currentDirs);
            }
            // set columns to read in conf
            if (pushDownProjection) {
                pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
            }
            addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }
        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }
    // set columns to read in conf
    if (pushDownProjection) {
        pushProjection(newjob, readColumnsBuffer, readColumnNamesBuffer);
    }
    if (dirs.length != 0) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Generating splits for dirs: {}", dirs);
        }
        addSplitsForGroup(currentDirs, currentTableScan, newjob, getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass, currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }
    Utilities.clearWorkMapForConf(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("number of splits " + result.size());
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.GET_SPLITS);
    return result.toArray(new HiveInputSplit[result.size()]);
}

Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ArrayList(java.util.ArrayList) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) VectorPartitionDesc(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit)161 Path (org.apache.hadoop.fs.Path)57 JobConf (org.apache.hadoop.mapred.JobConf)56 Test (org.junit.Test)49 IOException (java.io.IOException)47 ArrayList (java.util.ArrayList)29 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)27 FileSplit (org.apache.hadoop.mapred.FileSplit)24 FileSystem (org.apache.hadoop.fs.FileSystem)21 TextInputFormat (org.apache.hadoop.mapred.TextInputFormat)21 InputFormat (org.apache.hadoop.mapred.InputFormat)19 RecordWriter (org.apache.hadoop.mapred.RecordWriter)19 NullWritable (org.apache.hadoop.io.NullWritable)18 Text (org.apache.hadoop.io.Text)18 Configuration (org.apache.hadoop.conf.Configuration)14 LongWritable (org.apache.hadoop.io.LongWritable)11 FileInputFormat (org.apache.hadoop.mapred.FileInputFormat)10 Properties (java.util.Properties)9 TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint)9 HashMap (java.util.HashMap)8