use of org.apache.hadoop.mapred.SequenceFileRecordReader in project incubator-systemml by apache.
the class SamplingSortMRInputFormat method writePartitionFile.
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
*
* @param conf the job to sample
* @param partFile where to write the output file to
* @return index of the first partition boundary key that is greater than or equal to zero, or partitions-1 if all boundary keys are negative
* @throws IOException if something goes wrong
* @throws InstantiationException if InstantiationException occurs
* @throws IllegalAccessException if IllegalAccessException occurs
*/
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile) throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();
    Class<? extends WritableComparable> targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
    // get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
    // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }
    if (totalcount == 0) // empty input files
        sampler.addValue(new DoubleWritable(0));
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
    return index0;
}
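For orientation, here is a minimal, hypothetical sketch of how a driver might invoke writePartitionFile before submitting the sort job. The variable names (numMappers, numReducers, tmpDir) and the partition-file location are illustrative assumptions, not the actual SystemML SortMR driver setup.

// hypothetical driver-side sketch (not the actual SystemML SortMR driver code)
JobConf conf = new JobConf(SamplingSortMRInputFormat.class);
// input paths, block sizes and converters are assumed to be configured via MRJobConfiguration elsewhere
conf.setNumMapTasks(numMappers);      // assumed variable: number of input splits to request
conf.setNumReduceTasks(numReducers);  // N reducers -> the sampler picks N-1 boundary keys
Path partFile = new Path(tmpDir, "sortPartitionFile");  // assumed temporary location
// samples the input, sorts the sampled keys, and writes N-1 boundary keys to partFile
int index0 = SamplingSortMRInputFormat.writePartitionFile(conf, partFile);
// index0 is the first partition whose boundary key is >= 0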
use of org.apache.hadoop.mapred.SequenceFileRecordReader in project systemml by apache.
the class SamplingSortMRInputFormat method writePartitionFile.
(The implementation is identical to the incubator-systemml version shown above.)
use of org.apache.hadoop.mapred.SequenceFileRecordReader in project voldemort by voldemort.
the class MysqlBuildPerformanceTest method main.
public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + MysqlBuildPerformanceTest.class.getName() + " serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new MysqlStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {
        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(1000, 1);
    System.out.println("MySQL write throughput with one thread:");
    readWriteTest.printStats();
}
use of org.apache.hadoop.mapred.SequenceFileRecordReader in project voldemort by voldemort.
the class BdbBuildPerformanceTest method main.
public static void main(String[] args) throws FileNotFoundException, IOException {
    if (args.length != 3)
        Utils.croak("USAGE: java " + BdbBuildPerformanceTest.class.getName() + " serverPropsFile storeName jsonSequenceDataFile");
    String serverPropsFile = args[0];
    String storeName = args[1];
    String jsonDataFile = args[2];
    final Store<ByteArray, byte[], byte[]> store = new BdbStorageConfiguration(new VoldemortConfig(new Props(new File(serverPropsFile)))).getStore(TestUtils.makeStoreDefinition(storeName), TestUtils.makeSingleNodeRoutingStrategy());
    final AtomicInteger obsoletes = new AtomicInteger(0);
    Path jsonFilePath = new Path(jsonDataFile);
    FileStatus jsonFileStatus = jsonFilePath.getFileSystem(new Configuration()).listStatus(jsonFilePath)[0];
    final SequenceFileRecordReader<BytesWritable, BytesWritable> reader = new SequenceFileRecordReader<BytesWritable, BytesWritable>(new Configuration(), new FileSplit(jsonFilePath, 0, jsonFileStatus.getLen(), (String[]) null));
    PerformanceTest readWriteTest = new PerformanceTest() {
        @Override
        public void doOperation(int index) throws Exception {
            try {
                BytesWritable key = new BytesWritable();
                BytesWritable value = new BytesWritable();
                reader.next(key, value);
                store.put(new ByteArray(ByteUtils.copy(key.get(), 0, key.getSize())), Versioned.value(ByteUtils.copy(value.get(), 0, value.getSize())), null);
            } catch (ObsoleteVersionException e) {
                obsoletes.incrementAndGet();
            }
        }
    };
    readWriteTest.run(30 * 1000 * 1000, 1);
    System.out.println("Bdb write throughput with one thread:");
    readWriteTest.printStats();
}
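Both performance tests above read one record per doOperation call and never close the reader. The following is a minimal standalone sketch, assuming the same single-file layout and BytesWritable key/value types as the tests above, that iterates a whole SequenceFile split with org.apache.hadoop.mapred.SequenceFileRecordReader and closes it when done. The input path is a placeholder.

// minimal standalone read loop; the input path is a placeholder
Configuration conf = new Configuration();
Path path = new Path("jsonSequenceDataFile");  // assumed: same single-file layout as the tests above
FileStatus status = path.getFileSystem(conf).listStatus(path)[0];
SequenceFileRecordReader<BytesWritable, BytesWritable> reader =
    new SequenceFileRecordReader<BytesWritable, BytesWritable>(conf, new FileSplit(path, 0, status.getLen(), (String[]) null));
try {
    BytesWritable key = reader.createKey();
    BytesWritable value = reader.createValue();
    long records = 0;
    // next() fills key/value in place and returns false at the end of the split
    while (reader.next(key, value)) {
        records++;
    }
    System.out.println("read " + records + " records");
} finally {
    // release the underlying SequenceFile.Reader and its stream
    reader.close();
}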