Example 41 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project voldemort by voldemort.

the class BdbBuildPerformanceTest method main.

public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + BdbBuildPerformanceTest.class.getName() + "serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new BdbStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {

        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(30 * 1000 * 1000, 1);
    System.out.println("Bdb write throuhput with one thread:");
    readWriteTest.printStats();
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) BdbStorageConfiguration(voldemort.store.bdb.BdbStorageConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SequenceFileRecordReader(org.apache.hadoop.mapred.SequenceFileRecordReader) BytesWritable(org.apache.hadoop.io.BytesWritable) Props(voldemort.utils.Props) FileSplit(org.apache.hadoop.mapred.FileSplit) VoldemortConfig(voldemort.server.VoldemortConfig) ObsoleteVersionException(voldemort.versioning.ObsoleteVersionException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ByteArray(voldemort.utils.ByteArray) PerformanceTest(voldemort.performance.PerformanceTest) File(java.io.File)
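
This example and the next share the same core pattern: wrap the entire input file in a single FileSplit so a SequenceFileRecordReader can stream every record without running a MapReduce job. A minimal standalone sketch of that read loop, assuming a hypothetical input path (imports follow the "Also used" list above):

// Sketch of the shared read pattern: one FileSplit spanning the whole file.
// "/data/records.seq" is a hypothetical placeholder path.
Configuration conf = new Configuration();
Path path = new Path("/data/records.seq");
FileStatus status = path.getFileSystem(conf).getFileStatus(path);
SequenceFileRecordReader<BytesWritable, BytesWritable> reader =
        new SequenceFileRecordReader<BytesWritable, BytesWritable>(
                conf, new FileSplit(path, 0, status.getLen(), (String[]) null));
try {
    BytesWritable key = new BytesWritable();
    BytesWritable value = new BytesWritable();
    // next() returns false once the end of the split is reached
    while (reader.next(key, value)) {
        // process key and value here
    }
} finally {
    reader.close();
}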

Example 42 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project voldemort by voldemort.

the class MysqlBuildPerformanceTest method main.

public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + MysqlBuildPerformanceTest.class.getName() + "serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new MysqlStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {

        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(1000, 1);
    System.out.println("MySQl write throuhput with one thread:");
    readWriteTest.printStats();
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) MysqlStorageConfiguration(voldemort.store.mysql.MysqlStorageConfiguration) Configuration(org.apache.hadoop.conf.Configuration) SequenceFileRecordReader(org.apache.hadoop.mapred.SequenceFileRecordReader) BytesWritable(org.apache.hadoop.io.BytesWritable) Props(voldemort.utils.Props) FileSplit(org.apache.hadoop.mapred.FileSplit) VoldemortConfig(voldemort.server.VoldemortConfig) ObsoleteVersionException(voldemort.versioning.ObsoleteVersionException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ByteArray(voldemort.utils.ByteArray) PerformanceTest(voldemort.performance.PerformanceTest) File(java.io.File)

Example 43 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project voldemort by voldemort.

the class JsonSequenceFileInputFormat method getRecordReader.

@Override
public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
    String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
    log.info("Input file path:" + inputPathString);
    Path inputPath = new Path(inputPathString);
    SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf), inputPath, conf);
    SequenceFile.Metadata meta = reader.getMetadata();
    try {
        Text keySchema = meta.get(new Text("key.schema"));
        Text valueSchema = meta.get(new Text("value.schema"));
        if (0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
            throw new Exception();
        }
        // update the JobConf with the schemas
        conf.set("mapper.input.key.schema", keySchema.toString());
        conf.set("mapper.input.value.schema", valueSchema.toString());
    } catch (Exception e) {
        throw new IOException("Failed to load schema from file: " + inputPathString, e);
    }
    return super.getRecordReader(split, conf, reporter);
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) RecordReader(org.apache.hadoop.mapred.RecordReader) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapred.FileSplit)
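
For getRecordReader to find anything, the key.schema and value.schema entries must have been written into the file's header when it was created. A hedged sketch of that writing side, using the SequenceFile.createWriter overload that accepts a Metadata argument; the output path and the schema strings are illustrative placeholders, not taken from the project:

// Sketch: create a SequenceFile whose metadata header carries the
// schemas that JsonSequenceFileInputFormat later reads back.
// The output path and schema strings are hypothetical placeholders.
Configuration conf = new Configuration();
Path out = new Path("/data/records.seq");
SequenceFile.Metadata meta = new SequenceFile.Metadata();
meta.set(new Text("key.schema"), new Text("'string'"));
meta.set(new Text("value.schema"), new Text("'int32'"));
SequenceFile.Writer writer = SequenceFile.createWriter(
        out.getFileSystem(conf), conf, out,
        BytesWritable.class, BytesWritable.class,
        SequenceFile.CompressionType.NONE, null, null, meta);
try {
    writer.append(new BytesWritable(), new BytesWritable());
} finally {
    writer.close();
}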

Example 44 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.

the class HiveMongoInputFormat method getSplits.

@Override
public FileSplit[] getSplits(final JobConf conf, final int numSplits) throws IOException {
    try {
        MongoSplitter splitterImpl = MongoSplitterFactory.getSplitter(conf);
        final List<org.apache.hadoop.mapreduce.InputSplit> splits = splitterImpl.calculateSplits();
        InputSplit[] splitIns = splits.toArray(new InputSplit[splits.size()]);
        // wrap InputSplits in FileSplits so that 'getPath' 
        // doesn't produce an error (Hive bug)
        FileSplit[] wrappers = new FileSplit[splitIns.length];
        Path path = new Path(conf.get(MongoStorageHandler.TABLE_LOCATION));
        for (int i = 0; i < wrappers.length; i++) {
            wrappers[i] = new MongoHiveInputSplit(splitIns[i], path);
        }
        return wrappers;
    } catch (SplitFailedException spfe) {
        // split failed because no namespace found 
        // (so the corresponding collection doesn't exist)
        LOG.error(spfe.getMessage(), spfe);
        throw new IOException(spfe.getMessage(), spfe);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MongoSplitter(com.mongodb.hadoop.splitter.MongoSplitter) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapred.FileSplit) SplitFailedException(com.mongodb.hadoop.splitter.SplitFailedException) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
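
The wrapping trick deserves a closer look: Hive calls getPath() on every split, which fails for splits that are not file-backed, so each split is wrapped in a FileSplit subclass that reports the table location as its path. An illustrative sketch of such a wrapper follows; the class name is hypothetical and this is not the project's actual MongoHiveInputSplit source:

// Hypothetical wrapper: extend FileSplit so Hive's getPath() works,
// while delegating length and locations to the wrapped split.
public class PathReportingSplit extends FileSplit {

    private final org.apache.hadoop.mapreduce.InputSplit delegate;

    public PathReportingSplit(final org.apache.hadoop.mapreduce.InputSplit delegate, final Path path) {
        // report the table location as this split's path
        super(path, 0L, 0L, (String[]) null);
        this.delegate = delegate;
    }

    @Override
    public long getLength() {
        try {
            return delegate.getLength();
        } catch (Exception e) {
            // fall back to 0 when the delegate cannot report a length
            return 0L;
        }
    }

    @Override
    public String[] getLocations() throws IOException {
        try {
            return delegate.getLocations();
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}

A production version would also need to serialize the delegate in write and readFields so the split survives being shipped to the task.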

Example 45 with FileSplit

use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.

the class BSONFileInputFormat method getSplits.

@Override
public FileSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    FileStatus[] inputFiles = listStatus(job);
    List<FileSplit> results = new ArrayList<FileSplit>();
    for (FileStatus file : inputFiles) {
        FileSystem fs = FileSystem.get(file.getPath().toUri(), job);
        if (!isSplitable(fs, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so " + "cannot be split.");
            org.apache.hadoop.mapreduce.lib.input.FileSplit delegate = splitter.createFileSplit(file, fs, 0L, file.getLen());
            results.add(new BSONFileSplit(delegate.getPath(), delegate.getStart(), delegate.getLength(), delegate.getLocations()));
            continue;
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), job);
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        for (org.apache.hadoop.mapreduce.lib.input.FileSplit split : splitter.getAllSplits()) {
            BSONFileSplit fsplit = new BSONFileSplit(split.getPath(), split.getStart(), split.getLength(), split.getLocations());
            fsplit.setKeyField(MongoConfigUtil.getInputKey(job));
            results.add(fsplit);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(format("Total of %d found.", results.size()));
    }
    return results.toArray(new BSONFileSplit[results.size()]);
}
Also used : BSONSplitter.getSplitsFilePath(com.mongodb.hadoop.splitter.BSONSplitter.getSplitsFilePath) Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) BSONFileSplit(com.mongodb.hadoop.mapred.input.BSONFileSplit) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter) FileSplit(org.apache.hadoop.mapred.FileSplit) FileSystem(org.apache.hadoop.fs.FileSystem)
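
To see what this produces, the format can be driven directly from a JobConf without running a full job. A small sketch, assuming a hypothetical BSON dump path (FileInputFormat here is the old-API org.apache.hadoop.mapred.FileInputFormat):

// Sketch: list the splits BSONFileInputFormat computes for a dump file.
// "/data/dump.bson" is a hypothetical placeholder path.
JobConf job = new JobConf(new Configuration());
FileInputFormat.setInputPaths(job, new Path("/data/dump.bson"));
BSONFileInputFormat inputFormat = new BSONFileInputFormat();
FileSplit[] splits = inputFormat.getSplits(job, 1);
for (FileSplit split : splits) {
    System.out.println(split.getPath() + " at offset " + split.getStart() + ", length " + split.getLength());
}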

Aggregations

FileSplit (org.apache.hadoop.mapred.FileSplit): 66 uses
Path (org.apache.hadoop.fs.Path): 38 uses
InputSplit (org.apache.hadoop.mapred.InputSplit): 23 uses
JobConf (org.apache.hadoop.mapred.JobConf): 16 uses
File (java.io.File): 10 uses
IOException (java.io.IOException): 10 uses
Configuration (org.apache.hadoop.conf.Configuration): 10 uses
FileStatus (org.apache.hadoop.fs.FileStatus): 10 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 10 uses
Test (org.junit.Test): 9 uses
RecordReader (org.apache.hadoop.mapred.RecordReader): 8 uses
ArrayList (java.util.ArrayList): 7 uses
Properties (java.util.Properties): 7 uses
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 7 uses
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5 uses
NullWritable (org.apache.hadoop.io.NullWritable): 5 uses
InputFormat (org.apache.hadoop.mapred.InputFormat): 4 uses
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 4 uses
ClusterTopology (org.apache.hyracks.api.topology.ClusterTopology): 4 uses
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint): 4 uses