Example 41 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hive by apache.

The class TestRCFileMapReduceInputFormat, method writeThenReadByRecordReader:

private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = null;
        cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        writer.append(bytes);
    }
    writer.close();
    RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat = new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
    Configuration jobConf = new Configuration(cloneConf);
    jobConf.set("mapred.input.dir", testDir.toString());
    JobContext context = new Job(jobConf);
    HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
    List<InputSplit> splits = inputFormat.getSplits(context);
    assertEquals("splits length should be " + splitNumber, splits.size(), splitNumber);
    int readCount = 0;
    for (int i = 0; i < splits.size(); i++) {
        TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(jobConf, new TaskAttemptID());
        RecordReader<LongWritable, BytesRefArrayWritable> rr = inputFormat.createRecordReader(splits.get(i), tac);
        rr.initialize(splits.get(i), tac);
        while (rr.nextKeyValue()) {
            readCount++;
        }
    }
    assertEquals("readCount should be equal to writeCount", readCount, writeCount);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) RCFile(org.apache.hadoop.hive.ql.io.RCFile) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
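
For comparison, the same enumerate-splits-then-read loop can be written against the plain Hadoop 2.x mapreduce API, without the RCFile classes or the Hive/HCatalog shim. The following is a minimal sketch, assuming TextInputFormat over an existing text file and using org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl directly; the method name and parameters are illustrative, not part of the Hive test.

private static long countRecords(Configuration conf, Path input, long maxSplitSize) throws IOException, InterruptedException {
    Job job = Job.getInstance(conf);
    FileInputFormat.addInputPath(job, input);
    // Cap the split size so a single large file yields several splits.
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
    TextInputFormat inputFormat = new TextInputFormat();
    long records = 0;
    for (InputSplit split : inputFormat.getSplits(job)) {
        TaskAttemptContext tac = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, tac);
        reader.initialize(split, tac);
        try {
            while (reader.nextKeyValue()) {
                records++;
            }
        } finally {
            reader.close();
        }
    }
    return records;
}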

Example 42 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project crunch by cloudera.

The class CrunchInputSplit, method readFields:

public void readFields(DataInput in) throws IOException {
    nodeIndex = in.readInt();
    int extraConfSize = in.readInt();
    if (extraConfSize > 0) {
        for (int i = 0; i < extraConfSize; i++) {
            conf.set(in.readUTF(), in.readUTF());
        }
    }
    inputFormatClass = (Class<? extends InputFormat<?, ?>>) readClass(in);
    Class<? extends InputSplit> inputSplitClass = (Class<? extends InputSplit>) readClass(in);
    inputSplit = (InputSplit) ReflectionUtils.newInstance(inputSplitClass, conf);
    SerializationFactory factory = new SerializationFactory(conf);
    Deserializer deserializer = factory.getDeserializer(inputSplitClass);
    deserializer.open((DataInputStream) in);
    inputSplit = (InputSplit) deserializer.deserialize(inputSplit);
}
Also used : Deserializer(org.apache.hadoop.io.serializer.Deserializer) SerializationFactory(org.apache.hadoop.io.serializer.SerializationFactory) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
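
The matching write side is not shown on this page. As a rough sketch of what it typically looks like with Hadoop's SerializationFactory: the extraConf map, the use of Text.writeString for the class names (pairing with a readClass(in) that calls conf.getClassByName(Text.readString(in))), and the decision not to close the serializer are assumptions here, not the actual Crunch implementation.

public void write(DataOutput out) throws IOException {
    out.writeInt(nodeIndex);
    // Assumed field: a Map<String, String> holding the extra configuration entries.
    out.writeInt(extraConf.size());
    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        out.writeUTF(e.getKey());
        out.writeUTF(e.getValue());
    }
    // Assumed convention: class names written as strings, mirroring readClass(in).
    Text.writeString(out, inputFormatClass.getName());
    Text.writeString(out, inputSplit.getClass().getName());
    // Serialize the wrapped split with whatever Hadoop serialization handles its type.
    SerializationFactory factory = new SerializationFactory(conf);
    Serializer serializer = factory.getSerializer(inputSplit.getClass());
    serializer.open((DataOutputStream) out);
    serializer.serialize(inputSplit);
    // The stream belongs to the caller, so the serializer is not closed here.
}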

Example 43 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project druid by druid-io.

The class DatasourceRecordReaderTest, method testSanity:

@Test
public void testSanity() throws Exception {
    DataSegment segment = new DefaultObjectMapper().readValue(this.getClass().getClassLoader().getResource("test-segment/descriptor.json"), DataSegment.class).withLoadSpec(ImmutableMap.<String, Object>of("type", "local", "path", this.getClass().getClassLoader().getResource("test-segment/index.zip").getPath()));
    InputSplit split = new DatasourceInputSplit(Lists.newArrayList(WindowedDataSegment.of(segment)), null);
    Configuration config = new Configuration();
    config.set(DatasourceInputFormat.CONF_DRUID_SCHEMA, HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(new DatasourceIngestionSpec(segment.getDataSource(), segment.getInterval(), null, null, null, null, segment.getDimensions(), segment.getMetrics(), false)));
    TaskAttemptContext context = EasyMock.createNiceMock(TaskAttemptContext.class);
    EasyMock.expect(context.getConfiguration()).andReturn(config).anyTimes();
    EasyMock.replay(context);
    DatasourceRecordReader rr = new DatasourceRecordReader();
    rr.initialize(split, context);
    Assert.assertEquals(0, rr.getProgress(), 0.0001);
    List<InputRow> rows = Lists.newArrayList();
    while (rr.nextKeyValue()) {
        rows.add(rr.getCurrentValue());
    }
    verifyRows(rows);
    Assert.assertEquals(1, rr.getProgress(), 0.0001);
    rr.close();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) InputRow(io.druid.data.input.InputRow) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) DefaultObjectMapper(io.druid.jackson.DefaultObjectMapper) DataSegment(io.druid.timeline.DataSegment) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
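
The read loop above recurs throughout these examples. A small generic helper captures it; this is a sketch, not part of the Druid test, and it assumes the reader has already been initialized with its split and context. Note that some formats reuse the value object between calls to nextKeyValue(), so callers that keep the values may need to copy them.

private static <K, V> List<V> drain(RecordReader<K, V> reader) throws IOException, InterruptedException {
    List<V> values = new ArrayList<V>();
    try {
        while (reader.nextKeyValue()) {
            // getCurrentValue() may return a reused object for some formats.
            values.add(reader.getCurrentValue());
        }
    } finally {
        reader.close();
    }
    return values;
}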

Example 44 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

The class GridFSInputFormat, method getSplits:

@Override
public List<InputSplit> getSplits(final JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    DBCollection inputCollection = MongoConfigUtil.getInputCollection(conf);
    MongoClientURI inputURI = MongoConfigUtil.getInputURI(conf);
    GridFS gridFS = new GridFS(inputCollection.getDB(), inputCollection.getName());
    DBObject query = MongoConfigUtil.getQuery(conf);
    List<InputSplit> splits = new LinkedList<InputSplit>();
    for (GridFSDBFile file : gridFS.find(query)) {
        if (MongoConfigUtil.isGridFSWholeFileSplit(conf)) {
            // One split per file.
            splits.add(new GridFSSplit(inputURI, (ObjectId) file.getId(), (int) file.getChunkSize(), file.getLength()));
        } else {
            // One split per file chunk.
            for (int chunk = 0; chunk < file.numChunks(); ++chunk) {
                splits.add(new GridFSSplit(inputURI, (ObjectId) file.getId(), (int) file.getChunkSize(), file.getLength(), chunk));
            }
        }
    }
    LOG.debug("Found GridFS splits: " + splits);
    return splits;
}
Also used : DBCollection(com.mongodb.DBCollection) GridFSSplit(com.mongodb.hadoop.input.GridFSSplit) Configuration(org.apache.hadoop.conf.Configuration) ObjectId(org.bson.types.ObjectId) GridFSDBFile(com.mongodb.gridfs.GridFSDBFile) MongoClientURI(com.mongodb.MongoClientURI) GridFS(com.mongodb.gridfs.GridFS) DBObject(com.mongodb.DBObject) InputSplit(org.apache.hadoop.mapreduce.InputSplit) LinkedList(java.util.LinkedList)
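
Whichever InputFormat produces them, the splits returned by getSplits() expose two things the framework schedules on: a byte length and the preferred host locations. A minimal, format-agnostic sketch for inspecting them (the method and variable names are placeholders):

private static void describeSplits(InputFormat<?, ?> format, JobContext context) throws IOException, InterruptedException {
    for (InputSplit split : format.getSplits(context)) {
        // getLength() sizes the split for scheduling; getLocations() drives data locality.
        System.out.printf("%s: %d bytes on %s%n", split, split.getLength(), Arrays.toString(split.getLocations()));
    }
}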

Example 45 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

The class StandaloneMongoSplitter, method calculateSplits:

@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    final DBObject splitKey = MongoConfigUtil.getInputSplitKey(getConfiguration());
    final DBObject splitKeyMax = MongoConfigUtil.getMaxSplitKey(getConfiguration());
    final DBObject splitKeyMin = MongoConfigUtil.getMinSplitKey(getConfiguration());
    final int splitSize = MongoConfigUtil.getSplitSize(getConfiguration());
    final MongoClientURI inputURI;
    DBCollection inputCollection = null;
    final ArrayList<InputSplit> returnVal;
    try {
        inputURI = MongoConfigUtil.getInputURI(getConfiguration());
        MongoClientURI authURI = MongoConfigUtil.getAuthURI(getConfiguration());
        if (authURI != null) {
            inputCollection = MongoConfigUtil.getCollectionWithAuth(inputURI, authURI);
        } else {
            inputCollection = MongoConfigUtil.getCollection(inputURI);
        }
        returnVal = new ArrayList<InputSplit>();
        final String ns = inputCollection.getFullName();
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Running splitVector on namespace: %s.%s; hosts: %s", inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
        }
        final DBObject cmd = BasicDBObjectBuilder.start("splitVector", ns).add("keyPattern", splitKey).add("min", splitKeyMin).add("max", splitKeyMax).add("force", false).add("maxChunkSize", splitSize).get();
        CommandResult data;
        boolean ok = true;
        try {
            data = inputCollection.getDB().getSisterDB(inputURI.getDatabase()).command(cmd, ReadPreference.primary());
        } catch (final MongoException e) {
            // 2.0 servers throw exceptions rather than info in a CommandResult
            data = null;
            LOG.info(e.getMessage(), e);
            if (e.getMessage().contains("unrecognized command: splitVector")) {
                ok = false;
            } else {
                throw e;
            }
        }
        if (data != null) {
            if (data.containsField("$err")) {
                throw new SplitFailedException("Error calculating splits: " + data);
            } else if (!data.get("ok").equals(1.0)) {
                ok = false;
            }
        }
        if (!ok) {
            final CommandResult stats = inputCollection.getStats();
            if (stats.containsField("primary")) {
                final DBCursor shards = inputCollection.getDB().getSisterDB("config").getCollection("shards").find(new BasicDBObject("_id", stats.getString("primary")));
                try {
                    if (shards.hasNext()) {
                        final DBObject shard = shards.next();
                        final String host = ((String) shard.get("host")).replace(shard.get("_id") + "/", "");
                        final MongoClientURI shardHost;
                        if (authURI != null) {
                            shardHost = new MongoClientURIBuilder(authURI).host(host).build();
                        } else {
                            shardHost = new MongoClientURIBuilder(inputURI).host(host).build();
                        }
                        MongoClient shardClient = null;
                        try {
                            shardClient = new MongoClient(shardHost);
                            data = shardClient.getDB(shardHost.getDatabase()).command(cmd, ReadPreference.primary());
                        } catch (final Exception e) {
                            LOG.error(e.getMessage(), e);
                        } finally {
                            if (shardClient != null) {
                                shardClient.close();
                            }
                        }
                    }
                } finally {
                    shards.close();
                }
            }
            if (data != null && !data.get("ok").equals(1.0)) {
                throw new SplitFailedException("Unable to calculate input splits: " + data.get("errmsg"));
            }
        }
        // Comes in a format where "min" and "max" are implicit
        // and each entry is just a boundary key; not ranged
        final BasicDBList splitData = (BasicDBList) data.get("splitKeys");
        if (splitData.size() == 0) {
            LOG.warn("WARNING: No Input Splits were calculated by the split code. Proceeding with a *single* split. Data may be too" + " small, try lowering 'mongo.input.split_size' if this is undesirable.");
        }
        // Lower boundary of the first min split
        BasicDBObject lastKey = null;
        // If splitKeyMin was given, use it as first boundary.
        if (!splitKeyMin.toMap().isEmpty()) {
            lastKey = new BasicDBObject(splitKeyMin.toMap());
        }
        for (final Object aSplitData : splitData) {
            final BasicDBObject currentKey = (BasicDBObject) aSplitData;
            returnVal.add(createSplitFromBounds(lastKey, currentKey));
            lastKey = currentKey;
        }
        BasicDBObject maxKey = null;
        // If splitKeyMax was given, use it as last boundary.
        if (!splitKeyMax.toMap().isEmpty()) {
            maxKey = new BasicDBObject(splitKeyMax.toMap());
        }
        // Last max split
        final MongoInputSplit lastSplit = createSplitFromBounds(lastKey, maxKey);
        returnVal.add(lastSplit);
    } finally {
        if (inputCollection != null) {
            MongoConfigUtil.close(inputCollection.getDB().getMongo());
        }
    }
    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(returnVal);
    }
    return returnVal;
}
Also used : MongoException(com.mongodb.MongoException) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) MongoClientURI(com.mongodb.MongoClientURI) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) CommandResult(com.mongodb.CommandResult) DBCollection(com.mongodb.DBCollection) MongoClient(com.mongodb.MongoClient) BasicDBList(com.mongodb.BasicDBList) DBCursor(com.mongodb.DBCursor) MongoClientURIBuilder(com.mongodb.hadoop.util.MongoClientURIBuilder) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
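
Stripped of the MongoDB driver calls, the boundary handling above reduces to pairing consecutive splitVector keys into (lower, upper) ranges, with null standing in for an open bound unless an explicit min or max key was configured. A minimal sketch of just that pairing step, with illustrative names:

static List<Object[]> pairBounds(List<?> boundaries, Object min, Object max) {
    List<Object[]> ranges = new ArrayList<Object[]>();
    // null means an open lower bound, exactly like the lastKey handling above.
    Object lower = min;
    for (Object boundary : boundaries) {
        ranges.add(new Object[] { lower, boundary });
        lower = boundary;
    }
    // Final range; a null max leaves the upper bound open.
    ranges.add(new Object[] { lower, max });
    return ranges;
}

With boundaries [b1, b2] this yields (min, b1), (b1, b2), (b2, max), which is exactly the sequence of createSplitFromBounds calls in calculateSplits: N boundary keys always produce N + 1 splits.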

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160
Configuration (org.apache.hadoop.conf.Configuration): 70
Test (org.junit.Test): 68
ArrayList (java.util.ArrayList): 51
Path (org.apache.hadoop.fs.Path): 43
Job (org.apache.hadoop.mapreduce.Job): 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38
IOException (java.io.IOException): 33
JobContext (org.apache.hadoop.mapreduce.JobContext): 20
LongWritable (org.apache.hadoop.io.LongWritable): 19
FileSystem (org.apache.hadoop.fs.FileSystem): 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13
List (java.util.List): 13
Text (org.apache.hadoop.io.Text): 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13
DBObject (com.mongodb.DBObject): 10
File (java.io.File): 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9