use of org.apache.hadoop.mapred.RecordReader in project hive by apache.
the class HiveInputFormat method getRecordReader.
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    HiveInputSplit hsplit = (HiveInputSplit) split;
    InputSplit inputSplit = hsplit.getInputSplit();
    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
        inputFormatClassName = hsplit.inputFormatClassName();
        inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
        throw new IOException("cannot find class " + inputFormatClassName, e);
    }
    if (this.mrwork == null || pathToPartitionInfo == null) {
        init(job);
    }
    boolean nonNative = false;
    PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, hsplit.getPath(), null);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
    }
    if ((part != null) && (part.getTableDesc() != null)) {
        Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
        nonNative = part.getTableDesc().isNonNative();
    }
    Path splitPath = hsplit.getPath();
    pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    try {
        inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    RecordReader innerReader = null;
    try {
        innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
        innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader<K, V> rr = new HiveRecordReader(innerReader, job);
    rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
    return rr;
}
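For context, a minimal sketch (not from the Hive source) of how a caller typically drives a reader obtained from an org.apache.hadoop.mapred.InputFormat such as the one above; the countRecords helper and its class name are illustrative assumptions.

import java.io.IOException;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class RecordReaderDriver {
    // Reads every record in one split and returns the record count.
    static <K, V> long countRecords(InputFormat<K, V> format, InputSplit split, JobConf job) throws IOException {
        RecordReader<K, V> reader = format.getRecordReader(split, job, Reporter.NULL);
        K key = reader.createKey();
        V value = reader.createValue();
        long count = 0;
        try {
            // next() refills the reusable key/value containers until the split is exhausted.
            while (reader.next(key, value)) {
                count++;
            }
        } finally {
            reader.close();
        }
        return count;
    }
}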
use of org.apache.hadoop.mapred.RecordReader in project trevni by cutting.
the class AvroTrevniInputFormat method getRecordReader.
@Override
public RecordReader<AvroWrapper<T>, NullWritable> getRecordReader(InputSplit split, final JobConf job, Reporter reporter) throws IOException {
    final FileSplit file = (FileSplit) split;
    reporter.setStatus(file.toString());
    final AvroColumnReader.Params params = new AvroColumnReader.Params(new HadoopInput(file.getPath(), job));
    params.setModel(ReflectData.get());
    if (job.get(AvroJob.INPUT_SCHEMA) != null)
        params.setSchema(AvroJob.getInputSchema(job));
    return new RecordReader<AvroWrapper<T>, NullWritable>() {
        private AvroColumnReader<T> reader = new AvroColumnReader<T>(params);
        private float rows = reader.getRowCount();
        private long row;

        public AvroWrapper<T> createKey() {
            return new AvroWrapper<T>(null);
        }

        public NullWritable createValue() {
            return NullWritable.get();
        }

        public boolean next(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            if (!reader.hasNext())
                return false;
            wrapper.datum(reader.next());
            row++;
            return true;
        }

        public float getProgress() throws IOException {
            return row / rows;
        }

        public long getPos() throws IOException {
            return row;
        }

        public void close() throws IOException {
            reader.close();
        }
    };
}
use of org.apache.hadoop.mapred.RecordReader in project asterixdb by apache.
the class HDFSReadOperatorDescriptor method createPushRuntime.
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx, IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions) throws HyracksDataException {
    final InputSplit[] inputSplits = splitsFactory.getSplits();
    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getServiceContext().getNodeId();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                JobConf conf = confFactory.getConf();
                conf.setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                try {
                    parser.open(writer);
                    InputFormat inputFormat = conf.getInputFormat();
                    for (int i = 0; i < inputSplits.length; i++) {
                        /**
                         * read all the partitions scheduled to the current node
                         */
                        if (scheduledLocations[i].equals(nodeName)) {
                            /**
                             * pick an unread split to read
                             * synchronize among simultaneous partitions in the same machine
                             */
                            synchronized (executed) {
                                if (executed[i] == false) {
                                    executed[i] = true;
                                } else {
                                    continue;
                                }
                            }
                            /**
                             * read the split
                             */
                            RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
                            Object key = reader.createKey();
                            Object value = reader.createValue();
                            while (reader.next(key, value) == true) {
                                parser.parse(key, value, writer, inputSplits[i].toString());
                            }
                        }
                    }
                } finally {
                    parser.close(writer);
                }
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
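The synchronized block above is the key detail: every partition on a node walks the full split list, and the shared executed array guarantees each split is claimed by exactly one of them. A minimal stand-alone sketch of that claim-once pattern (the class and helper names are illustrative, not from the AsterixDB source):

public class SplitClaimer {
    // Returns true for exactly one caller per index, even when several threads race on the same split.
    static boolean claimSplit(boolean[] executed, int i) {
        synchronized (executed) {
            if (executed[i]) {
                return false;   // another partition on this node already read split i
            }
            executed[i] = true; // mark the split taken before releasing the lock
            return true;
        }
    }
}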
use of org.apache.hadoop.mapred.RecordReader in project hbase by apache.
the class TableInputFormatBase method getRecordReader.
/**
* Builds a TableRecordReader. If no TableRecordReader was provided, uses
* the default.
*
* @see org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit,
* JobConf, Reporter)
*/
public RecordReader<ImmutableBytesWritable, Result> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    // In case a subclass uses the deprecated approach or calls initializeTable directly
    if (table == null) {
        initialize(job);
    }
    // null check in case our child overrides getTable to not throw.
    try {
        if (getTable() == null) {
            // initialize() must not have been implemented in the subclass.
            throw new IOException(INITIALIZATION_ERROR);
        }
    } catch (IllegalStateException exception) {
        throw new IOException(INITIALIZATION_ERROR, exception);
    }
    TableSplit tSplit = (TableSplit) split;
    // if no table record reader was provided use default
    final TableRecordReader trr = this.tableRecordReader == null ? new TableRecordReader() : this.tableRecordReader;
    trr.setStartRow(tSplit.getStartRow());
    trr.setEndRow(tSplit.getEndRow());
    trr.setHTable(this.table);
    trr.setInputColumns(this.inputColumns);
    trr.setRowFilter(this.rowFilter);
    trr.init();
    return new RecordReader<ImmutableBytesWritable, Result>() {
        @Override
        public void close() throws IOException {
            trr.close();
            closeTable();
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return trr.createKey();
        }

        @Override
        public Result createValue() {
            return trr.createValue();
        }

        @Override
        public long getPos() throws IOException {
            return trr.getPos();
        }

        @Override
        public float getProgress() throws IOException {
            return trr.getProgress();
        }

        @Override
        public boolean next(ImmutableBytesWritable key, Result value) throws IOException {
            return trr.next(key, value);
        }
    };
}
use of org.apache.hadoop.mapred.RecordReader in project hive by apache.
the class TestRCFile method writeThenReadByRecordReader.
private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long minSplitSize, CompressionCodec codec) throws IOException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = null;
        cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        if (i == intervalRecordCount) {
            System.out.println("write position:" + writer.getLength());
        }
        writer.append(bytes);
    }
    writer.close();
    RCFileInputFormat inputFormat = new RCFileInputFormat();
    JobConf jonconf = new JobConf(cloneConf);
    jonconf.set("mapred.input.dir", testDir.toString());
    HiveConf.setLongVar(jonconf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, minSplitSize);
    InputSplit[] splits = inputFormat.getSplits(jonconf, splitNumber);
    assertEquals("splits length should be " + splitNumber, splits.length, splitNumber);
    int readCount = 0;
    for (int i = 0; i < splits.length; i++) {
        int previousReadCount = readCount;
        RecordReader rr = inputFormat.getRecordReader(splits[i], jonconf, Reporter.NULL);
        Object key = rr.createKey();
        Object value = rr.createValue();
        while (rr.next(key, value)) {
            readCount++;
        }
        rr.close();
        System.out.println("The " + i + "th split read " + (readCount - previousReadCount));
    }
    assertEquals("readCount should be equal to writeCount", writeCount, readCount);
}
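A hypothetical invocation of the helper above; the actual test methods pick parameter values that satisfy the split-count assertion, so these numbers are only illustrative.

// Illustrative only: 500-row record interval, 1000 rows written, expecting 3 splits with a small
// minimum split size and no compression. The codec argument may also be an instance of
// org.apache.hadoop.io.compress.DefaultCodec.
writeThenReadByRecordReader(500, 1000, 3, 1000, null);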