Example 1 with RecordReader

Use of org.apache.hadoop.mapreduce.RecordReader in project hadoop by apache.

The class TeraInputFormat, method writePartitionFile.

/**
   * Use the input splits to take samples of the input and generate sample
   * keys. By default reads 100,000 keys from 10 locations in the input, sorts
   * them and picks N-1 keys to generate N equally sized partitions.
   * @param job the job to sample
   * @param partFile where to write the output file to
   * @throws Throwable if something goes wrong
   */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(), TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {

            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx), context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println("Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                    // Ignored: an interrupt simply ends sampling for this split early.
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
            // Ignored: keep waiting for the remaining sampler threads.
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), DataOutputStream (java.io.DataOutputStream), RecordReader (org.apache.hadoop.mapreduce.RecordReader), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), FileSystem (org.apache.hadoop.fs.FileSystem), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
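
For context, here is a minimal driver-side sketch of how a TeraSort-style job might invoke this sampler before submission. It is a hedged illustration, not code from the example above: the class name, input/output arguments, and reducer count are assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.terasort.TeraInputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class PartitionSampleDriver {

    public static void main(String[] args) throws Throwable {
        // Sketch: args[0] = input directory to sample, args[1] = partition file path.
        Job job = Job.getInstance(new Configuration(), "terasort-sample");
        job.setInputFormatClass(TeraInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // N reduce tasks -> the sampler writes N-1 cut keys defining N roughly equal ranges.
        job.setNumReduceTasks(4);
        Path partitionFile = new Path(args[1]);
        TeraInputFormat.writePartitionFile(job, partitionFile);
        // The partition file is then shipped to tasks (for example via the distributed
        // cache) and read by TeraSort's total-order partitioner.
    }
}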

Example 2 with RecordReader

Use of org.apache.hadoop.mapreduce.RecordReader in project carbondata by apache.

The class CarbonReaderBuilder, method build.

public <T> CarbonReader<T> build() throws IOException, InterruptedException {
    CarbonTable table = CarbonTable.buildFromTablePath("_temp", tablePath);
    final CarbonFileInputFormat format = new CarbonFileInputFormat();
    final Job job = new Job(new Configuration());
    format.setTableInfo(job.getConfiguration(), table.getTableInfo());
    format.setTablePath(job.getConfiguration(), table.getTablePath());
    format.setTableName(job.getConfiguration(), table.getTableName());
    format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    if (filterExpression != null) {
        format.setFilterPredicates(job.getConfiguration(), filterExpression);
    }
    if (projectionColumns != null) {
        format.setColumnProjection(job.getConfiguration(), new CarbonProjection(projectionColumns));
    }
    final List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
    List<RecordReader<Void, T>> readers = new ArrayList<>(splits.size());
    for (InputSplit split : splits) {
        TaskAttemptContextImpl attempt = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader reader = format.createRecordReader(split, attempt);
        reader.initialize(split, attempt);
        readers.add(reader);
    }
    return new CarbonReader<>(readers);
}
Also used: JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl), Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), RecordReader (org.apache.hadoop.mapreduce.RecordReader), ArrayList (java.util.ArrayList), CarbonFileInputFormat (org.apache.carbondata.hadoop.api.CarbonFileInputFormat), CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable), CarbonProjection (org.apache.carbondata.hadoop.CarbonProjection), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), JobID (org.apache.hadoop.mapreduce.JobID)
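
Each reader in the list built above has already been initialized, and the returned CarbonReader iterates over them. The following is a minimal sketch of that standard Hadoop RecordReader consumption pattern, not CarbonReader's actual implementation; the class and method names here are illustrative.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.mapreduce.RecordReader;

public final class ReaderDrainSketch {

    // Collects every value from a list of already-initialized readers.
    static <T> List<T> drain(List<RecordReader<Void, T>> readers)
            throws IOException, InterruptedException {
        List<T> rows = new ArrayList<>();
        for (RecordReader<Void, T> reader : readers) {
            try {
                while (reader.nextKeyValue()) {          // advance to the next record
                    rows.add(reader.getCurrentValue());  // the key is Void; the value carries the row
                }
            } finally {
                reader.close();                          // release the split's resources
            }
        }
        return rows;
    }
}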

Example 3 with RecordReader

Use of org.apache.hadoop.mapreduce.RecordReader in project carbondata by apache.

The class CarbonStreamInputFormatTest, method testCreateRecordReader.

@Test
public void testCreateRecordReader() {
    try {
        InputSplit inputSplit = buildInputSplit();
        CarbonStreamInputFormat inputFormat = new CarbonStreamInputFormat();
        RecordReader recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
        Assert.assertNotNull("Failed to create record reader", recordReader);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Also used: RecordReader (org.apache.hadoop.mapreduce.RecordReader), InputSplit (org.apache.hadoop.mapreduce.InputSplit), CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit), IOException (java.io.IOException), Test (org.junit.Test)
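
The test above only asserts that a reader is created. Below is a hedged sketch of how it could be exercised further, assuming the buildInputSplit() and taskAttemptContext fixtures reference readable stream data; none of this is in the original test.

RecordReader recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
Assert.assertNotNull("Failed to create record reader", recordReader);
// Hypothetical continuation: initialize and iterate using the standard contract.
recordReader.initialize(inputSplit, taskAttemptContext);
int records = 0;
while (recordReader.nextKeyValue()) {
    records++;  // getCurrentKey()/getCurrentValue() are available here if needed
}
recordReader.close();
// Assert on 'records' against whatever the fixture data is expected to contain.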

Example 4 with RecordReader

Use of org.apache.hadoop.mapreduce.RecordReader in project hbase by apache.

The class MultiTableInputFormatBase, method createRecordReader.

/**
 * Builds a TableRecordReader. If no TableRecordReader was provided, uses the
 * default.
 *
 * @param split The split to work with.
 * @param context The current context.
 * @return The newly created record reader.
 * @throws IOException When creating the reader fails.
 * @throws InterruptedException when record reader initialization fails
 * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
 *      org.apache.hadoop.mapreduce.InputSplit,
 *      org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    TableSplit tSplit = (TableSplit) split;
    LOG.info(MessageFormat.format("Input split length: {0} bytes.", tSplit.getLength()));
    if (tSplit.getTable() == null) {
        throw new IOException("Cannot create a record reader because of a" + " previous error. Please look at the previous logs lines from" + " the task's full log for more details.");
    }
    final Connection connection = ConnectionFactory.createConnection(context.getConfiguration());
    Table table = connection.getTable(tSplit.getTable());
    if (this.tableRecordReader == null) {
        this.tableRecordReader = new TableRecordReader();
    }
    final TableRecordReader trr = this.tableRecordReader;
    try {
        Scan sc = tSplit.getScan();
        sc.withStartRow(tSplit.getStartRow());
        sc.withStopRow(tSplit.getEndRow());
        trr.setScan(sc);
        trr.setTable(table);
        return new RecordReader<ImmutableBytesWritable, Result>() {

            @Override
            public void close() throws IOException {
                trr.close();
                connection.close();
            }

            @Override
            public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
                return trr.getCurrentKey();
            }

            @Override
            public Result getCurrentValue() throws IOException, InterruptedException {
                return trr.getCurrentValue();
            }

            @Override
            public float getProgress() throws IOException, InterruptedException {
                return trr.getProgress();
            }

            @Override
            public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException, InterruptedException {
                trr.initialize(inputsplit, context);
            }

            @Override
            public boolean nextKeyValue() throws IOException, InterruptedException {
                return trr.nextKeyValue();
            }
        };
    } catch (IOException ioe) {
        // If there is an exception make sure that all
        // resources are closed and released.
        trr.close();
        connection.close();
        throw ioe;
    }
}
Also used: Table (org.apache.hadoop.hbase.client.Table), RecordReader (org.apache.hadoop.mapreduce.RecordReader), Connection (org.apache.hadoop.hbase.client.Connection), Scan (org.apache.hadoop.hbase.client.Scan), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
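
For reference, here is a hedged sketch of how a caller (normally the MapReduce framework itself) drives the wrapper returned above; the inputFormat, split, and context variables are illustrative. Note that close() tears down both the delegate TableRecordReader and the Connection opened in createRecordReader.

RecordReader<ImmutableBytesWritable, Result> reader =
        inputFormat.createRecordReader(split, context);  // inputFormat: a MultiTableInputFormatBase subclass
reader.initialize(split, context);
try {
    while (reader.nextKeyValue()) {
        ImmutableBytesWritable rowKey = reader.getCurrentKey();  // row key of the current row
        Result row = reader.getCurrentValue();                   // cells of that row
        // A mapper's map(rowKey, row, ...) call would happen here.
    }
} finally {
    reader.close();  // closes the delegate TableRecordReader and the HBase Connection
}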

Example 5 with RecordReader

Use of org.apache.hadoop.mapreduce.RecordReader in project druid by druid-io.

The class BaseParquetInputTest, method getAllRows.

static List<InputRow> getAllRows(String parserType, HadoopDruidIndexerConfig config) throws IOException, InterruptedException {
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);
    InputFormat inputFormat = ReflectionUtils.newInstance(INPUT_FORMAT_CLASSES.get(parserType), job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        List<InputRow> records = new ArrayList<>();
        InputRowParser parser = config.getParser();
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            Object data = reader.getCurrentValue();
            records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
        }
        return records;
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), RecordReader (org.apache.hadoop.mapreduce.RecordReader), ArrayList (java.util.ArrayList), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), InputFormat (org.apache.hadoop.mapreduce.InputFormat), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), InputRow (org.apache.druid.data.input.InputRow), InputRowParser (org.apache.druid.data.input.impl.InputRowParser), Job (org.apache.hadoop.mapreduce.Job), File (java.io.File)

Aggregations

RecordReader (org.apache.hadoop.mapreduce.RecordReader): 24 usages
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 17 usages
Configuration (org.apache.hadoop.conf.Configuration): 13 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 13 usages
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 11 usages
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 11 usages
InputFormat (org.apache.hadoop.mapreduce.InputFormat): 9 usages
Job (org.apache.hadoop.mapreduce.Job): 8 usages
Test (org.junit.Test): 8 usages
ArrayList (java.util.ArrayList): 7 usages
Path (org.apache.hadoop.fs.Path): 7 usages
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 6 usages
IOException (java.io.IOException): 4 usages
File (java.io.File): 3 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 3 usages
Mapper (org.apache.hadoop.mapreduce.Mapper): 3 usages
WrappedMapper (org.apache.hadoop.mapreduce.lib.map.WrappedMapper): 3 usages
Scan (org.apache.hadoop.hbase.client.Scan): 2 usages
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 2 usages
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl): 2 usages