
Example 11 with RecordReader

Use of org.apache.hadoop.mapred.RecordReader in project hive by apache.

From class VectorizedOrcInputFormat, method getRecordReader.

@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException {
    FileSplit fSplit = (FileSplit) inputSplit;
    reporter.setStatus(fSplit.toString());
    Path path = fSplit.getPath();
    OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
    if (fSplit instanceof OrcSplit) {
        OrcSplit orcSplit = (OrcSplit) fSplit;
        if (orcSplit.hasFooter()) {
            opts.orcTail(orcSplit.getOrcTail());
        }
        opts.maxLength(orcSplit.getFileLength());
    }
    Reader reader = OrcFile.createReader(path, opts);
    return new VectorizedOrcRecordReader(reader, conf, fSplit);
}
Also used : Path(org.apache.hadoop.fs.Path) RecordReader(org.apache.hadoop.mapred.RecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit)
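
As a usage sketch (not from the Hive sources): a caller can drive this record reader batch-by-batch through the old mapred API. The class name, method name, and the assumption that the split and JobConf are already prepared are all illustrative.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.orc.VectorizedOrcInputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class VectorizedOrcScan {
    // Counts the rows in one ORC split; fSplit and conf are assumed to be
    // prepared by the caller (hypothetical setup).
    static long countRows(FileSplit fSplit, JobConf conf) throws IOException {
        RecordReader<NullWritable, VectorizedRowBatch> reader =
            new VectorizedOrcInputFormat().getRecordReader(fSplit, conf, Reporter.NULL);
        NullWritable key = reader.createKey();
        VectorizedRowBatch batch = reader.createValue();
        long rows = 0;
        try {
            while (reader.next(key, batch)) {
                // batch.size is the number of valid rows in this batch
                rows += batch.size;
            }
        } finally {
            reader.close();
        }
        return rows;
    }
}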

Example 12 with RecordReader

Use of org.apache.hadoop.mapred.RecordReader in project hive by apache.

From class SerDeEncodedDataReader, method startReadSplitFromFile.

public void startReadSplitFromFile(FileSplit split, boolean[] splitIncludes, StripeData slice) throws IOException {
    boolean maySplitTheSplit = slice == null;
    ReaderWithOffsets offsetReader = null;
    @SuppressWarnings("rawtypes") RecordReader sourceReader = sourceInputFormat.getRecordReader(split, jobConf, reporter);
    try {
        offsetReader = createOffsetReader(sourceReader);
        sourceReader = null;
    } finally {
        if (sourceReader != null) {
            try {
                sourceReader.close();
            } catch (Exception ex) {
                LlapIoImpl.LOG.error("Failed to close source reader", ex);
            }
        }
    }
    maySplitTheSplit = maySplitTheSplit && offsetReader.hasOffsets();
    try {
        StructObjectInspector originalOi = (StructObjectInspector) getOiFromSerDe();
        List<Integer> splitColumnIds = OrcInputFormat.genIncludedColumnsReverse(schema, splitIncludes, false);
        // the file read writes to the writer, which writes to orcWriter, which writes to cacheWriter
        EncodingWriter writer = VectorDeserializeOrcWriter.create(sourceInputFormat, sourceSerDe, parts, daemonConf, jobConf, split.getPath(), originalOi, splitColumnIds, splitIncludes, allocSize);
        // TODO: move this into ctor? EW would need to create CacheWriter then
        List<Integer> cwColIds = writer.isOnlyWritingIncludedColumns() ? splitColumnIds : columnIds;
        writer.init(new CacheWriter(bufferManager, cwColIds, splitIncludes, writer.isOnlyWritingIncludedColumns()), daemonConf, split.getPath());
        if (writer instanceof VectorDeserializeOrcWriter) {
            VectorDeserializeOrcWriter asyncWriter = (VectorDeserializeOrcWriter) writer;
            asyncWriter.startAsync(new AsyncCacheDataCallback());
            this.asyncWriters.add(asyncWriter);
        }
        currentFileRead = new FileReaderYieldReturn(offsetReader, split, writer, maySplitTheSplit, targetSliceRowCount);
    } finally {
        // Assignment is the last thing in the try, so if it happened we assume success.
        if (currentFileRead != null)
            return;
        if (offsetReader == null)
            return;
        try {
            offsetReader.close();
        } catch (Exception ex) {
            LlapIoImpl.LOG.error("Failed to close source reader", ex);
        }
    }
}
Also used : RecordReader(org.apache.hadoop.mapred.RecordReader) LineRecordReader(org.apache.hadoop.mapred.LineRecordReader) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) CacheWriter(org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
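
The try/finally around sourceReader is an ownership-transfer pattern: once createOffsetReader succeeds, the returned wrapper owns the underlying reader, so nulling sourceReader keeps the finally block from closing it out from under the wrapper. A minimal sketch of the same pattern, with a hypothetical wrap() standing in for createOffsetReader:

import java.io.Closeable;
import java.io.IOException;

final class OwnershipTransfer {
    // Closes raw only if wrapping fails; after a successful wrap, the
    // wrapper is responsible for closing raw.
    static Closeable wrapSafely(Closeable raw) throws IOException {
        Closeable toClose = raw;
        try {
            Closeable wrapper = wrap(raw); // may throw
            toClose = null; // ownership transferred to wrapper
            return wrapper;
        } finally {
            if (toClose != null) {
                try {
                    toClose.close();
                } catch (IOException ignored) {
                    // best-effort cleanup, mirroring the LOG.error above
                }
            }
        }
    }

    // Hypothetical stand-in for createOffsetReader.
    private static Closeable wrap(Closeable raw) {
        return raw;
    }
}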

Example 13 with RecordReader

Use of org.apache.hadoop.mapred.RecordReader in project hadoop by apache.

From class TestPipeApplication, method testApplication.

/**
   * Tests org.apache.hadoop.mapred.pipes.Application.
   * Exercises the internal message types: MessageType.REGISTER_COUNTER, INCREMENT_COUNTER, STATUS, PROGRESS...
   *
   * @throws Throwable
   */
@Test
public void testApplication() throws Throwable {
    JobConf conf = new JobConf();
    RecordReader<FloatWritable, NullWritable> rReader = new Reader();
    // client for test
    File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationStub");
    TestTaskReporter reporter = new TestTaskReporter();
    File[] psw = cleanTokenPasswordFile();
    try {
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
        conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
        // token for authorization
        Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service"));
        TokenCache.setJobToken(token, conf.getCredentials());
        FakeCollector output = new FakeCollector(new Counters.Counter(), new Progress());
        FileSystem fs = new RawLocalFileSystem();
        fs.initialize(FsConstants.LOCAL_FS_URI, conf);
        Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(new Path(workSpace.getAbsolutePath() + File.separator + "outfile")), IntWritable.class, Text.class, null, null, true);
        output.setWriter(wr);
        conf.set(Submitter.PRESERVE_COMMANDFILE, "true");
        initStdOut(conf);
        Application<WritableComparable<IntWritable>, Writable, IntWritable, Text> application = new Application<WritableComparable<IntWritable>, Writable, IntWritable, Text>(conf, rReader, output, reporter, IntWritable.class, Text.class);
        application.getDownlink().flush();
        application.getDownlink().mapItem(new IntWritable(3), new Text("txt"));
        application.getDownlink().flush();
        application.waitForFinish();
        wr.close();
        // test getDownlink().mapItem();
        String stdOut = readStdOut(conf);
        assertTrue(stdOut.contains("key:3"));
        assertTrue(stdOut.contains("value:txt"));
        // the reporter's counter and status updates should have been sent
        // test MessageType.REGISTER_COUNTER and INCREMENT_COUNTER
        assertEquals(1.0, reporter.getProgress(), 0.01);
        assertNotNull(reporter.getCounter("group", "name"));
        // test status MessageType.STATUS
        assertEquals(reporter.getStatus(), "PROGRESS");
        stdOut = readFile(new File(workSpace.getAbsolutePath() + File.separator + "outfile"));
        // check MessageType.PROGRESS
        assertEquals(0.55f, rReader.getProgress(), 0.001);
        application.getDownlink().close();
        // test MessageType.OUTPUT
        Entry<IntWritable, Text> entry = output.getCollect().entrySet().iterator().next();
        assertEquals(123, entry.getKey().get());
        assertEquals("value", entry.getValue().toString());
        try {
            // try to abort
            application.abort(new Throwable());
            fail();
        } catch (IOException e) {
            // abort raised the expected exception
            assertEquals("pipe child exception", e.getMessage());
        }
    } finally {
        if (psw != null) {
            // remove password files
            for (File file : psw) {
                file.deleteOnExit();
            }
        }
    }
}
Also used : RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) RecordReader(org.apache.hadoop.mapred.RecordReader) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Token(org.apache.hadoop.security.token.Token) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) FileSystem(org.apache.hadoop.fs.FileSystem) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) Counter(org.apache.hadoop.mapred.Counters.Counter) FloatWritable(org.apache.hadoop.io.FloatWritable) WritableComparable(org.apache.hadoop.io.WritableComparable) Counters(org.apache.hadoop.mapred.Counters) File(java.io.File) Writer(org.apache.hadoop.mapred.IFile.Writer) Test(org.junit.Test)
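
The Reader stub passed to the Application constructor is defined elsewhere in TestPipeApplication and is not shown on this page; a minimal RecordReader stub compatible with the declaration above might look like the sketch below. The hard-coded 0.55f mirrors the test's progress assertion and is an assumption about how the real stub behaves.

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

class StubReader implements RecordReader<FloatWritable, NullWritable> {
    // No input records: the pipes child process, not this reader, drives the test.
    public boolean next(FloatWritable key, NullWritable value) {
        return false;
    }

    public FloatWritable createKey() {
        return new FloatWritable();
    }

    public NullWritable createValue() {
        return NullWritable.get();
    }

    public long getPos() {
        return 0L;
    }

    public void close() {
    }

    // Matches assertEquals(0.55f, rReader.getProgress(), 0.001) in the test.
    public float getProgress() {
        return 0.55f;
    }
}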

Example 14 with RecordReader

Use of org.apache.hadoop.mapred.RecordReader in project hadoop by apache.

From class AutoInputFormat, method getRecordReader.

public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
    FSDataInputStream is = fs.open(fileSplit.getPath());
    byte[] header = new byte[3];
    RecordReader reader = null;
    try {
        is.readFully(header);
    } catch (EOFException eof) {
        // The file is shorter than the three-byte magic; treat it as text.
        reader = textInputFormat.getRecordReader(split, job, reporter);
    } finally {
        is.close();
    }
    // Only sniff the header if the EOF fallback did not already pick a reader;
    // otherwise the reader created above would be overwritten and leaked.
    if (reader == null) {
        if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
            reader = seqFileInputFormat.getRecordReader(split, job, reporter);
        } else {
            reader = textInputFormat.getRecordReader(split, job, reporter);
        }
    }
    return reader;
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) RecordReader(org.apache.hadoop.mapred.RecordReader) EOFException(java.io.EOFException) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileSplit(org.apache.hadoop.mapred.FileSplit)
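
A hypothetical driver-side sketch of putting this format to work: setting AutoInputFormat on a JobConf lets each input file be sniffed for the SequenceFile magic and routed to the appropriate reader. The class name and input path below are illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.streaming.AutoInputFormat;

public class AutoInputJob {
    public static void main(String[] args) {
        JobConf job = new JobConf(AutoInputJob.class);
        // Each file is probed for the 'SEQ' magic and dispatched to the
        // sequence-file or text record reader accordingly.
        job.setInputFormat(AutoInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("/data/mixed-input"));
        // ... configure mapper/reducer and submit as usual
    }
}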

Example 15 with RecordReader

Use of org.apache.hadoop.mapred.RecordReader in project presto by prestodb.

From class HiveUtil, method createRecordReader.

public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns) {
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns = ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
    List<Integer> readHiveColumnIndexes = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
    // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
    setReadColumns(configuration, readHiveColumnIndexes);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
    JobConf jobConf = new JobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
    // propagate serialization configuration to getRecordReader
    schema.stringPropertyNames().stream().filter(name -> name.startsWith("serialization.")).forEach(name -> jobConf.set(name, schema.getProperty(name)));
    try {
        return retry().stopOnIllegalExceptions().run("createRecordReader", () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
    } catch (Exception e) {
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", path, start, length, getInputFormatName(schema), e.getMessage()), e);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) FileSplit(org.apache.hadoop.mapred.FileSplit) InputFormat(org.apache.hadoop.mapred.InputFormat) JobConf(org.apache.hadoop.mapred.JobConf) RecordReader(org.apache.hadoop.mapred.RecordReader) Reporter(org.apache.hadoop.mapred.Reporter) Properties(java.util.Properties) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) RetryDriver.retry(com.facebook.presto.hive.RetryDriver.retry) String.format(java.lang.String.format)
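
One detail worth isolating from the method above is the propagation of "serialization." properties into the JobConf before getRecordReader is called, since the record reader needs the serialization configuration. A standalone sketch of just that step (the class and method names are mine):

import java.util.Properties;
import org.apache.hadoop.mapred.JobConf;

final class SerdeProps {
    // Copies every schema property starting with "serialization." into the
    // job configuration, as done before calling getRecordReader above.
    static void copySerializationProperties(Properties schema, JobConf jobConf) {
        schema.stringPropertyNames().stream()
              .filter(name -> name.startsWith("serialization."))
              .forEach(name -> jobConf.set(name, schema.getProperty(name)));
    }
}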

Aggregations

RecordReader (org.apache.hadoop.mapred.RecordReader): 17
Path (org.apache.hadoop.fs.Path): 9
FileSplit (org.apache.hadoop.mapred.FileSplit): 8
IOException (java.io.IOException): 7
JobConf (org.apache.hadoop.mapred.JobConf): 7
InputSplit (org.apache.hadoop.mapred.InputSplit): 6
FileSystem (org.apache.hadoop.fs.FileSystem): 5
InputFormat (org.apache.hadoop.mapred.InputFormat): 4
Text (org.apache.hadoop.io.Text): 3
Configuration (org.apache.hadoop.conf.Configuration): 2
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 2
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 2
SequenceFile (org.apache.hadoop.io.SequenceFile): 2
Reporter (org.apache.hadoop.mapred.Reporter): 2
PARTITION_KEY (com.facebook.presto.hive.HiveColumnHandle.ColumnType.PARTITION_KEY): 1
REGULAR (com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR): 1
HiveColumnHandle.bucketColumnHandle (com.facebook.presto.hive.HiveColumnHandle.bucketColumnHandle): 1
HiveColumnHandle.isBucketColumnHandle (com.facebook.presto.hive.HiveColumnHandle.isBucketColumnHandle): 1
HiveColumnHandle.isPathColumnHandle (com.facebook.presto.hive.HiveColumnHandle.isPathColumnHandle): 1
HiveColumnHandle.pathColumnHandle (com.facebook.presto.hive.HiveColumnHandle.pathColumnHandle): 1