Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
In class DelegatingInputFormat, the method getSplits:
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  JobConf confCopy = new JobConf(conf);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(conf);
  Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(conf);
  Map<Class<? extends InputFormat>, List<Path>> formatPaths =
      new HashMap<Class<? extends InputFormat>, List<Path>>();
  // First, build a map of InputFormats to Paths
  for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
    if (!formatPaths.containsKey(entry.getValue().getClass())) {
      formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
    }
    formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
  }
  for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
    Class<? extends InputFormat> formatClass = formatEntry.getKey();
    InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
    List<Path> paths = formatEntry.getValue();
    Map<Class<? extends Mapper>, List<Path>> mapperPaths =
        new HashMap<Class<? extends Mapper>, List<Path>>();
    // Then, for each set of paths that share an InputFormat, build
    // a map of Mappers to the paths they're used for
    for (Path path : paths) {
      Class<? extends Mapper> mapperClass = mapperMap.get(path);
      if (!mapperPaths.containsKey(mapperClass)) {
        mapperPaths.put(mapperClass, new LinkedList<Path>());
      }
      mapperPaths.get(mapperClass).add(path);
    }
    // Each set of paths that shares an InputFormat and a Mapper can
    // be added to the same job, and split together.
    for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
      paths = mapEntry.getValue();
      Class<? extends Mapper> mapperClass = mapEntry.getKey();
      if (mapperClass == null) {
        mapperClass = conf.getMapperClass();
      }
      FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
      }
    }
  }
  return splits.toArray(new InputSplit[splits.size()]);
}
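As a usage note: the splitting above only runs when a job is configured through MultipleInputs. A minimal sketch of that configuration, assuming hypothetical input paths and using the stock IdentityMapper in place of real map classes:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsJobSetup {
  public static JobConf configure() {
    JobConf conf = new JobConf();
    // Register a per-path InputFormat and Mapper; addInputPath records the
    // mappings read back by getInputFormatMap/getMapperTypeMap above and
    // installs DelegatingInputFormat as the job's input format.
    MultipleInputs.addInputPath(conf, new Path("/data/plain"),
        TextInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/data/kv"),
        KeyValueTextInputFormat.class, IdentityMapper.class);
    return conf;
  }
}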
Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
In class TestMultipleInputs, the method testAddInputPathWithFormat:
@Test
public void testAddInputPathWithFormat() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class);
  final Map<Path, InputFormat> inputs = MultipleInputs.getInputFormatMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")).getClass());
}
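A hedged companion sketch (not quoted from the test class) for the four-argument overload, which also records a Mapper per path; IdentityMapper again stands in for a real map implementation:

@Test
public void testAddInputPathWithMapperSketch() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      IdentityMapper.class);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs.getMapperTypeMap(conf);
  assertEquals(IdentityMapper.class, maps.get(new Path("/foo")));
}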
Use of org.apache.hadoop.mapred.InputFormat in project presto by prestodb.
In class HiveUtil, the method createRecordReader:
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path,
    long start, long length, Properties schema, List<HiveColumnHandle> columns) {
  // determine which hive columns we will read
  List<HiveColumnHandle> readColumns =
      ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
  List<Integer> readHiveColumnIndexes =
      ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
  // Tell Hive which columns we would like to read; this lets Hive optimize
  // reads of column-oriented files
  setReadColumns(configuration, readHiveColumnIndexes);
  InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
  JobConf jobConf = new JobConf(configuration);
  FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
  // propagate serialization configuration to getRecordReader
  schema.stringPropertyNames().stream()
      .filter(name -> name.startsWith("serialization."))
      .forEach(name -> jobConf.set(name, schema.getProperty(name)));
  try {
    return retry().stopOnIllegalExceptions().run("createRecordReader",
        () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
  } catch (Exception e) {
    throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT,
        format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
            path, start, length, getInputFormatName(schema), e.getMessage()), e);
  }
}
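The caller receives a raw mapred RecordReader. A minimal sketch of draining it, assuming only the stable RecordReader contract (createKey/createValue/next/close); the helper name is ours:

static <K, V> long countRecords(RecordReader<K, V> reader) throws IOException {
  long count = 0;
  final K key = reader.createKey();
  final V value = reader.createValue();
  try {
    // next() fills key/value with the next record, returning false at end of split
    while (reader.next(key, value)) {
      count++;
    }
  } finally {
    reader.close();
  }
  return count;
}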
Use of org.apache.hadoop.mapred.InputFormat in project drill by apache.
In class ConvertHiveParquetScanToDrillParquetScan, the method getInputFormatFromSD:
/**
 * Get the input format from the given {@link StorageDescriptor}.
 * @param properties table properties to merge into the job configuration
 * @param hiveReadEntry read entry for the Hive table being scanned
 * @param sd storage descriptor of the table or partition
 * @param hiveConf Hive configuration used to build the {@link JobConf}
 * @return {@link InputFormat} class, or null if a failure has occurred. Failure is logged as a warning.
 */
private Class<? extends InputFormat<?, ?>> getInputFormatFromSD(final Properties properties,
    final HiveReadEntry hiveReadEntry, final StorageDescriptor sd, final HiveConf hiveConf) {
  final Table hiveTable = hiveReadEntry.getTable();
  try {
    final String inputFormatName = sd.getInputFormat();
    if (!Strings.isNullOrEmpty(inputFormatName)) {
      return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
    }
    final JobConf job = new JobConf(hiveConf);
    HiveUtilities.addConfToJob(job, properties);
    return HiveUtilities.getInputFormatClass(job, sd, hiveTable);
  } catch (final Exception e) {
    logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
        hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
    return null;
  }
}
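A hedged sketch of how the returned class might be consumed by the rule: checking that the table actually uses Hive's Parquet input format before rewriting the scan. MapredParquetInputFormat is the real Hive class; the surrounding control flow is illustrative, not quoted from Drill:

final Class<? extends InputFormat<?, ?>> format =
    getInputFormatFromSD(properties, hiveReadEntry, sd, hiveConf);
if (format == null || !MapredParquetInputFormat.class.equals(format)) {
  return; // not a Parquet table, or the lookup failed; leave the plan unchanged
}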
Use of org.apache.hadoop.mapred.InputFormat in project drill by apache.
In class HiveMetadataProvider, the method splitInputWithUGI:
private List<InputSplitWrapper> splitInputWithUGI(final Properties properties,
    final StorageDescriptor sd, final Partition partition) throws Exception {
  watch.start();
  try {
    return ugi.doAs(new PrivilegedExceptionAction<List<InputSplitWrapper>>() {
      @Override
      public List<InputSplitWrapper> run() throws Exception {
        final List<InputSplitWrapper> splits = Lists.newArrayList();
        final JobConf job = new JobConf(hiveConf);
        HiveUtilities.addConfToJob(job, properties);
        job.setInputFormat(HiveUtilities.getInputFormatClass(job, sd, hiveReadEntry.getTable()));
        final Path path = new Path(sd.getLocation());
        final FileSystem fs = path.getFileSystem(job);
        if (fs.exists(path)) {
          FileInputFormat.addInputPath(job, path);
          final InputFormat<?, ?> format = job.getInputFormat();
          for (final InputSplit split : format.getSplits(job, 1)) {
            splits.add(new InputSplitWrapper(split, partition));
          }
        }
        return splits;
      }
    });
  } catch (final InterruptedException | IOException e) {
    final String errMsg = String.format("Failed to create input splits: %s", e.getMessage());
    logger.error(errMsg, e);
    throw new DrillRuntimeException(errMsg, e);
  } finally {
    logger.trace("Took {} µs to get splits from {}",
        watch.elapsed(TimeUnit.NANOSECONDS) / 1000, sd.getLocation());
    watch.stop();
  }
}
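The ugi field is assumed to be established elsewhere. A minimal sketch of the common proxy-user pattern for obtaining it, assuming the engine impersonates the query's end user (the user name parameter is hypothetical):

import java.io.IOException;
import org.apache.hadoop.security.UserGroupInformation;

static UserGroupInformation proxyUgiFor(String queryUser) throws IOException {
  // Run as queryUser on top of the service's login identity, so the
  // getSplits() call above sees the end user's filesystem permissions.
  return UserGroupInformation.createProxyUser(queryUser,
      UserGroupInformation.getLoginUser());
}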