
Example 21 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class StreamingAssert method readRecords.

List<Record> readRecords() throws Exception {
    if (currentDeltas.isEmpty()) {
        throw new AssertionError("No data");
    }
    // Configure an ACID (transactional) ORC read over the partition location,
    // including the schema-evolution columns/types and the valid transaction list.
    InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionLocation.toString());
    job.set("bucket_count", Integer.toString(table.getSd().getNumBuckets()));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    job.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
    job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
    // The test data is expected to land in a single split.
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    assertEquals(1, splits.length);
    final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = recordReader.createKey();
    OrcStruct value = recordReader.createValue();
    List<Record> records = new ArrayList<>();
    // Read every row, capturing its RecordIdentifier (transaction id, bucket, row id)
    // alongside the stringified OrcStruct value.
    while (recordReader.next(key, value)) {
        RecordIdentifier recordIdentifier = recordReader.getRecordIdentifier();
        Record record = new Record(new RecordIdentifier(recordIdentifier.getTransactionId(), recordIdentifier.getBucketId(), recordIdentifier.getRowId()), value.toString());
        System.out.println(record);
        records.add(record);
    }
    recordReader.close();
    return records;
}
Also used : ArrayList(java.util.ArrayList) AcidRecordReader(org.apache.hadoop.hive.ql.io.AcidInputFormat.AcidRecordReader) NullWritable(org.apache.hadoop.io.NullWritable) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) OrcStruct(org.apache.hadoop.hive.ql.io.orc.OrcStruct) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
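
The same getSplits / getRecordReader / next(key, value) loop applies to any org.apache.hadoop.mapred.InputFormat, not only the ACID ORC reader above. Below is a minimal, hedged sketch of that pattern against plain text files; the method name readLines and its Path argument are illustrative and not part of the Hive project, and it relies only on TextInputFormat, FileInputFormat, LongWritable and Text from the standard mapred/io packages.

List<String> readLines(Path inputDir) throws IOException {
    JobConf job = new JobConf();
    // org.apache.hadoop.mapred.FileInputFormat supplies the input-path helper.
    FileInputFormat.setInputPaths(job, inputDir);
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(job);
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    List<String> lines = new ArrayList<>();
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            lines.add(value.toString());
        }
        reader.close();
    }
    return lines;
}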

Example 22 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class JdbcInputFormatTest method testSplitLogic_noSpillOver.

@Test
public void testSplitLogic_noSpillOver() throws HiveJdbcDatabaseAccessException, IOException {
    JdbcInputFormat f = new JdbcInputFormat();
    when(mockDatabaseAccessor.getTotalNumberOfRecords(any(Configuration.class))).thenReturn(15);
    f.setDbAccessor(mockDatabaseAccessor);
    JobConf conf = new JobConf();
    conf.set("mapred.input.dir", "/temp");
    InputSplit[] splits = f.getSplits(conf, 3);
    assertThat(splits, is(notNullValue()));
    assertThat(splits.length, is(3));
    assertThat(splits[0].getLength(), is(5L));
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
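
The expected values follow directly from the mocked row count: 15 records spread across 3 requested splits is exactly 5 records per split, with nothing left over to spill into an extra split, hence splits.length of 3 and a first-split length of 5L. The snippet does not show how mockDatabaseAccessor is created; a hedged Mockito sketch of that wiring is below (the DatabaseAccessor interface name is an assumption about the accessor JdbcInputFormat delegates to).

// Hedged sketch: create the accessor mock that the test later stubs with
// when(...).thenReturn(15), as shown in the snippet above.
DatabaseAccessor mockDatabaseAccessor = Mockito.mock(DatabaseAccessor.class);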

Example 23 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class HDFSCluster method main.

public static void main(String[] args) throws Exception {
    HDFSCluster cluster = new HDFSCluster();
    cluster.setup();
    JobConf conf = configureJobConf();
    InputSplit[] inputSplits = conf.getInputFormat().getSplits(conf, 0);
    for (InputSplit split : inputSplits) {
        System.out.println("split :" + split);
    }
}
Also used : JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
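
Beyond its toString(), every org.apache.hadoop.mapred.InputSplit also reports its length in bytes and the hosts that store its data. A hedged extension of the loop above that prints both (java.util.Arrays is assumed for the formatting):

for (InputSplit split : inputSplits) {
    // getLength() and getLocations() are part of the InputSplit contract; both may throw IOException.
    System.out.println("split :" + split + " length=" + split.getLength() + " locations=" + Arrays.toString(split.getLocations()));
}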

Example 24 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class IndexingScheduler method buildPopularityMap.

/**
     * Scan the splits once and build a popularity map
     *
     * @param splits
     *            the split array
     * @param locationToNumOfSplits
     *            the map to be built
     * @throws IOException
     */
private void buildPopularityMap(InputSplit[] splits, Map<String, IntWritable> locationToNumOfSplits) throws IOException {
    for (InputSplit split : splits) {
        String[] locations = split.getLocations();
        for (String loc : locations) {
            IntWritable locCount = locationToNumOfSplits.get(loc);
            if (locCount == null) {
                locCount = new IntWritable(0);
                locationToNumOfSplits.put(loc, locCount);
            }
            locCount.set(locCount.get() + 1);
        }
    }
}
Also used : InputSplit(org.apache.hadoop.mapred.InputSplit) IntWritable(org.apache.hadoop.io.IntWritable)
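
A hedged usage sketch of the helper above: build the popularity map from a split array and read off the host that appears in the most splits. The splits variable and the surrounding scheduler class are assumed context; only InputSplit.getLocations() is relied upon.

Map<String, IntWritable> locationToNumOfSplits = new HashMap<>();
buildPopularityMap(splits, locationToNumOfSplits);
// Pick the host named by the largest number of splits.
String mostPopular = null;
int best = -1;
for (Map.Entry<String, IntWritable> entry : locationToNumOfSplits.entrySet()) {
    if (entry.getValue().get() > best) {
        best = entry.getValue().get();
        mostPopular = entry.getKey();
    }
}
System.out.println("most popular location: " + mostPopular + " (" + best + " splits)");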

Example 25 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project asterixdb by apache.

the class HDFSDataSourceFactory method configure.

@Override
public void configure(IServiceContext serviceCtx, Map<String, String> configuration) throws AsterixException {
    try {
        this.serviceCtx = serviceCtx;
        this.configuration = configuration;
        init((ICCServiceContext) serviceCtx);
        JobConf conf = HDFSUtils.configureHDFSJobConf(configuration);
        confFactory = new ConfFactory(conf);
        clusterLocations = getPartitionConstraint();
        int numPartitions = clusterLocations.getLocations().length;
        // if files list was set, we restrict the splits to the list
        InputSplit[] inputSplits;
        if (files == null) {
            inputSplits = conf.getInputFormat().getSplits(conf, numPartitions);
        } else {
            inputSplits = HDFSUtils.getSplits(conf, files);
        }
        if (indexingOp) {
            readSchedule = indexingScheduler.getLocationConstraints(inputSplits);
        } else {
            readSchedule = hdfsScheduler.getLocationConstraints(inputSplits);
        }
        inputSplitsFactory = new InputSplitsFactory(inputSplits);
        read = new boolean[readSchedule.length];
        Arrays.fill(read, false);
        String formatString = configuration.get(ExternalDataConstants.KEY_FORMAT);
        if (formatString == null || formatString.equals(ExternalDataConstants.FORMAT_HDFS_WRITABLE)) {
            // HDFS writable format: probe a reader on the first split to discover the
            // concrete value class this input format produces.
            RecordReader<?, ?> reader = conf.getInputFormat().getRecordReader(inputSplits[0], conf, Reporter.NULL);
            this.recordClass = reader.createValue().getClass();
            reader.close();
        } else {
            // Stream-based formats are parsed by a StreamRecordReader and yield char[] records.
            recordReaderClazz = StreamRecordReaderProvider.getRecordReaderClazz(configuration);
            this.recordClass = char[].class;
        }
    } catch (IOException e) {
        throw new AsterixException(e);
    }
}
Also used : ConfFactory(org.apache.hyracks.hdfs.dataflow.ConfFactory) AsterixException(org.apache.asterix.common.exceptions.AsterixException) IOException(java.io.IOException) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) InputSplitsFactory(org.apache.hyracks.hdfs.dataflow.InputSplitsFactory) AlgebricksAbsolutePartitionConstraint(org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint)
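
The writable-format branch above opens a reader on the first split only to learn the concrete value class the input format produces, then closes it immediately. A hedged distillation of that probe as a standalone helper (the name probeValueClass is illustrative, not part of the project):

static Class<?> probeValueClass(JobConf conf, InputSplit split) throws IOException {
    RecordReader<?, ?> reader = conf.getInputFormat().getRecordReader(split, conf, Reporter.NULL);
    try {
        // createValue() yields an instance of the Writable type the format emits.
        return reader.createValue().getClass();
    } finally {
        reader.close();
    }
}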

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8