use of org.apache.hadoop.mapred.SequenceFileRecordReader in project incubator-systemml by apache.
the class SamplingSortMRInputFormat method writePartitionFile.
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
*
* @param conf the job to sample
* @param partFile where to write the output file to
* @return index of the first partition boundary key that is greater than or equal to zero, or partitions-1 if all boundary keys are negative
* @throws IOException if something goes wrong
* @throws InstantiationException if InstantiationException occurs
* @throws IllegalAccessException if IllegalAccessException occurs
*/
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile) throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();
    Class<? extends WritableComparable> targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
    // get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
    // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }
    if (totalcount == 0) // empty input files
        sampler.addValue(new DoubleWritable(0));
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
    return index0;
}
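For orientation, here is a minimal, hypothetical sketch of how a driver might invoke writePartitionFile before submitting the sort job. The variable names (numMappers, numReducers, tmpDir) and the partition-file location are illustrative assumptions, not the actual SystemML SortMR driver setup.

// hypothetical driver-side sketch (not the actual SystemML SortMR driver code)
JobConf conf = new JobConf(SamplingSortMRInputFormat.class);
// input paths, block sizes and converters are assumed to be configured via MRJobConfiguration elsewhere
conf.setNumMapTasks(numMappers);      // assumed variable: number of input splits to request
conf.setNumReduceTasks(numReducers);  // N reducers -> the sampler picks N-1 boundary keys
Path partFile = new Path(tmpDir, "sortPartitionFile");  // assumed temporary location
// samples the input, sorts the sampled keys, and writes N-1 boundary keys to partFile
int index0 = SamplingSortMRInputFormat.writePartitionFile(conf, partFile);
// index0 is the first partition whose boundary key is >= 0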
use of org.apache.hadoop.mapred.SequenceFileRecordReader in project systemml by apache.
the class SamplingSortMRInputFormat method writePartitionFile.
(The implementation is identical to the incubator-systemml version shown above.)
use of org.apache.hadoop.mapred.SequenceFileRecordReader in project voldemort by voldemort.
the class MysqlBuildPerformanceTest method main.
public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + MysqlBuildPerformanceTest.class.getName() + " serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new MysqlStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {
        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(1000, 1);
    System.out.println("MySQL write throughput with one thread:");
    readWriteTest.printStats();
}
use of org.apache.hadoop.mapred.SequenceFileRecordReader in project voldemort by voldemort.
the class BdbBuildPerformanceTest method main.
public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + BdbBuildPerformanceTest.class.getName() + " serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new BdbStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {
        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(30 * 1000 * 1000, 1);
    System.out.println("Bdb write throughput with one thread:");
    readWriteTest.printStats();
}
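Both performance tests above read one record per doOperation call and never close the reader. The following is a minimal standalone sketch, assuming the same single-file layout and BytesWritable key/value types as the tests above, that iterates a whole SequenceFile split with org.apache.hadoop.mapred.SequenceFileRecordReader and closes it when done. The input path is a placeholder.

// minimal standalone read loop; the input path is a placeholder
Configuration conf = new Configuration();
Path path = new Path("jsonSequenceDataFile");  // assumed: same single-file layout as the tests above
FileStatus status = path.getFileSystem(conf).listStatus(path)[0];
SequenceFileRecordReader<BytesWritable, BytesWritable> reader =
    new SequenceFileRecordReader<BytesWritable, BytesWritable>(conf, new FileSplit(path, 0, status.getLen(), (String[]) null));
try {
    BytesWritable key = reader.createKey();
    BytesWritable value = reader.createValue();
    long records = 0;
    // next() fills key/value in place and returns false at the end of the split
    while (reader.next(key, value)) {
        records++;
    }
    System.out.println("read " + records + " records");
} finally {
    // release the underlying SequenceFile.Reader and its stream
    reader.close();
}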