Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class Gridmix, method start.
/**
 * Runs a complete Gridmix cycle: optional input-data generation, emulation
 * setup, job submission from the trace, and shutdown of all pipeline threads.
 *
 * @param conf gridmix configuration
 * @param traceIn trace file path (if it is '-', then trace comes from the
 * stream stdin)
 * @param ioPath Working directory for gridmix. GenerateData job
 * will generate data in the directory &lt;ioPath&gt;/input/ and
 * distributed cache data is generated in the directory
 * &lt;ioPath&gt;/distributedCache/, if -generate option is
 * specified.
 * @param genbytes size of input data to be generated under the directory
 * &lt;ioPath&gt;/input/
 * @param userResolver gridmix user resolver
 * @return exit code (0 on success, non-zero on any setup or startup failure)
 * @throws IOException if the trace contains an error
 * @throws InterruptedException if interrupted while joining pipeline threads
 */
int start(Configuration conf, String traceIn, Path ioPath, long genbytes, UserResolver userResolver) throws IOException, InterruptedException {
    DataStatistics stats = null;
    // NOTE(review): 'trace' is never assigned a non-null stream in this method;
    // the cleanup below is effectively a no-op — confirm against full file.
    InputStream trace = null;
    int exitCode = 0;
    try {
        Path scratchDir = new Path(ioPath, conf.get(GRIDMIX_OUT_DIR, "gridmix"));
        // add shutdown hook for SIGINT, etc.
        Runtime.getRuntime().addShutdownHook(sdh);
        CountDownLatch startFlag = new CountDownLatch(1);
        try {
            // Create, start job submission threads
            startThreads(conf, traceIn, ioPath, scratchDir, startFlag, userResolver);
            Path inputDir = getGridmixInputDataPath(ioPath);
            // Write input data if specified
            exitCode = writeInputData(genbytes, inputDir);
            if (exitCode != 0) {
                return exitCode;
            }
            // publish the data statistics
            stats = GenerateData.publishDataStatistics(inputDir, genbytes, conf);
            // scan input dir contents
            submitter.refreshFilePool();
            boolean shouldGenerate = (genbytes > 0);
            // set up the needed things for emulation of various loads
            exitCode = setupEmulation(conf, traceIn, scratchDir, ioPath, shouldGenerate);
            if (exitCode != 0) {
                return exitCode;
            }
            // start the summarizer
            summarizer.start(conf);
            factory.start();
            statistics.start();
        } catch (Throwable e) {
            // Log the failure through the logger instead of printStackTrace():
            // summary at ERROR, full stack trace only when DEBUG is enabled
            // (preserves the original stderr-free-at-INFO behavior).
            LOG.error("Startup failed. " + e.toString());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Startup failure stack trace", e);
            }
            // abort pipeline
            if (factory != null)
                factory.abort();
            exitCode = STARTUP_FAILED_ERROR;
        } finally {
            // signal for factory to start; sets start time
            startFlag.countDown();
        }
        if (factory != null) {
            // wait for input exhaustion
            factory.join(Long.MAX_VALUE);
            final Throwable badTraceException = factory.error();
            if (null != badTraceException) {
                LOG.error("Error in trace", badTraceException);
                throw new IOException("Error in trace", badTraceException);
            }
            // wait for pending tasks to be submitted
            submitter.shutdown();
            submitter.join(Long.MAX_VALUE);
            // wait for running tasks to complete
            monitor.shutdown();
            monitor.join(Long.MAX_VALUE);
            statistics.shutdown();
            statistics.join(Long.MAX_VALUE);
        }
    } finally {
        if (factory != null) {
            summarizer.finalize(factory, traceIn, genbytes, userResolver, stats, conf);
        }
        IOUtils.cleanup(LOG, trace);
    }
    return exitCode;
}
Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class CompressionEmulationUtil, method publishCompressedDataStatistics.
/** Publishes compression related data statistics. Following statistics are
 * published
 * <ul>
 * <li>Total compressed input data size</li>
 * <li>Number of compressed input data files</li>
 * <li>Compression Ratio</li>
 * <li>Text data dictionary size</li>
 * <li>Random text word size</li>
 * </ul>
 *
 * @param inputDir directory to scan for compressed input files
 * @param conf configuration used to resolve the filesystem and codecs
 * @param uncompressedDataSize original (pre-compression) data size; ratio is
 *        published only when this is positive
 * @return statistics over the compressed files found
 * @throws IOException on filesystem access failure
 * @throws RuntimeException if no compressed file is found in {@code inputDir}
 */
static DataStatistics publishCompressedDataStatistics(Path inputDir, Configuration conf, long uncompressedDataSize) throws IOException {
    FileSystem fs = inputDir.getFileSystem(conf);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    // iterate over compressed files and sum up the compressed file sizes
    long compressedDataSize = 0;
    int numCompressedFiles = 0;
    // obtain input data file statuses
    FileStatus[] outFileStatuses = fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
    for (FileStatus status : outFileStatuses) {
        // the factory is constructed above and can never be null here; a
        // non-null codec means the file suffix matches a known compression codec
        CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
        if (codec != null) {
            ++numCompressedFiles;
            compressedDataSize += status.getLen();
        }
    }
    LOG.info("Gridmix is configured to use compressed input data.");
    // publish the input data size
    LOG.info("Total size of compressed input data : " + StringUtils.humanReadableInt(compressedDataSize));
    LOG.info("Total number of compressed input data files : " + numCompressedFiles);
    if (numCompressedFiles == 0) {
        throw new RuntimeException("No compressed file found in the input" + " directory : " + inputDir.toString() + ". To enable compression" + " emulation, run Gridmix either with " + " an input directory containing compressed input file(s) or" + " use the -generate option to (re)generate it. If compression" + " emulation is not desired, disable it by setting '" + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
    }
    // publish compression ratio only if its generated in this gridmix run
    if (uncompressedDataSize > 0) {
        // compute the compression ratio
        double ratio = ((double) compressedDataSize) / uncompressedDataSize;
        // publish the compression ratio
        LOG.info("Input Data Compression Ratio : " + ratio);
    }
    return new DataStatistics(compressedDataSize, numCompressedFiles, true);
}
Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class TestGridmixSummary, method testExecutionSummarizer.
/**
 * Test {@link ExecutionSummarizer}: command-line capture, start/simulation
 * timestamps, per-job stat accumulation, and finalize() output fields
 * (trace signature, data size stringification, resolver, policy).
 */
@Test
@SuppressWarnings({ "unchecked", "rawtypes" })
public void testExecutionSummarizer() throws IOException {
    Configuration conf = new Configuration();
    ExecutionSummarizer es = new ExecutionSummarizer();
    assertEquals("ExecutionSummarizer init failed", Summarizer.NA, es.getCommandLineArgsString());
    long startTime = System.currentTimeMillis();
    // test configuration parameters
    String[] initArgs = new String[] { "-Xmx20m", "-Dtest.args='test'" };
    es = new ExecutionSummarizer(initArgs);
    assertEquals("ExecutionSummarizer init failed", "-Xmx20m -Dtest.args='test'", es.getCommandLineArgsString());
    // test start time
    assertTrue("Start time mismatch", es.getStartTime() >= startTime);
    assertTrue("Start time mismatch", es.getStartTime() <= System.currentTimeMillis());
    // test start() of ExecutionSummarizer
    es.update(null);
    assertEquals("ExecutionSummarizer init failed", 0, es.getSimulationStartTime());
    testExecutionSummarizer(0, 0, 0, 0, 0, 0, 0, es);
    long simStartTime = System.currentTimeMillis();
    es.start(null);
    assertTrue("Simulation start time mismatch", es.getSimulationStartTime() >= simStartTime);
    assertTrue("Simulation start time mismatch", es.getSimulationStartTime() <= System.currentTimeMillis());
    // test with job stats
    JobStats stats = generateFakeJobStats(1, 10, true, false);
    es.update(stats);
    testExecutionSummarizer(1, 10, 0, 1, 1, 0, 0, es);
    // test with failed job
    stats = generateFakeJobStats(5, 1, false, false);
    es.update(stats);
    testExecutionSummarizer(6, 11, 0, 2, 1, 1, 0, es);
    // test with successful but lost job
    stats = generateFakeJobStats(1, 1, true, true);
    es.update(stats);
    testExecutionSummarizer(7, 12, 0, 3, 1, 1, 1, es);
    // test with failed but lost job
    stats = generateFakeJobStats(2, 2, false, true);
    es.update(stats);
    testExecutionSummarizer(9, 14, 0, 4, 1, 1, 2, es);
    // test finalize
    // define a fake job factory
    JobFactory factory = new FakeJobFactory(conf);
    // fake the num jobs in trace
    factory.numJobsInTrace = 3;
    Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"));
    Path testDir = new Path(rootTempDir, "testGridmixSummary");
    Path testTraceFile = new Path(testDir, "test-trace.json");
    FileSystem fs = FileSystem.getLocal(conf);
    fs.create(testTraceFile).close();
    // finalize the summarizer
    UserResolver resolver = new RoundRobinUserResolver();
    DataStatistics dataStats = new DataStatistics(100, 2, true);
    String policy = GridmixJobSubmissionPolicy.REPLAY.name();
    conf.set(GridmixJobSubmissionPolicy.JOB_SUBMISSION_POLICY, policy);
    es.finalize(factory, testTraceFile.toString(), 1024L, resolver, dataStats, conf);
    // test num jobs in trace (message typo fixed: "Mismtach" -> "Mismatch")
    assertEquals("Mismatch in num jobs in trace", 3, es.getNumJobsInTrace());
    // test trace signature
    String tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    // test trace location
    Path qPath = fs.makeQualified(testTraceFile);
    assertEquals("Mismatch in trace filename", qPath.toString(), es.getInputTraceLocation());
    // test expected data size
    assertEquals("Mismatch in expected data size", "1 K", es.getExpectedDataSize());
    // test input data statistics
    assertEquals("Mismatch in input data statistics", ExecutionSummarizer.stringifyDataStatistics(dataStats), es.getInputDataStatistics());
    // test user resolver
    assertEquals("Mismatch in user resolver", resolver.getClass().getName(), es.getUserResolver());
    // test policy
    assertEquals("Mismatch in policy", policy, es.getJobSubmissionPolicy());
    // test data stringification using large data
    es.finalize(factory, testTraceFile.toString(), 1024 * 1024 * 1024 * 10L, resolver, dataStats, conf);
    assertEquals("Mismatch in expected data size", "10 G", es.getExpectedDataSize());
    // test trace signature uniqueness
    // touch the trace file
    fs.delete(testTraceFile, false);
    // sleep for 1 sec so the recreated file gets a different timestamp
    try {
        Thread.sleep(1000);
    } catch (InterruptedException ie) {
        // restore the interrupt status instead of silently swallowing it
        Thread.currentThread().interrupt();
    }
    fs.create(testTraceFile).close();
    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, conf);
    // test missing expected data size
    assertEquals("Mismatch in trace data size", Summarizer.NA, es.getExpectedDataSize());
    assertFalse("Mismatch in trace signature", tid.equals(es.getInputTraceSignature()));
    // get the new identifier
    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    testTraceFile = new Path(testDir, "test-trace2.json");
    fs.create(testTraceFile).close();
    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, conf);
    assertFalse("Mismatch in trace signature", tid.equals(es.getInputTraceSignature()));
    // get the new identifier
    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    // finalize trace identifier '-' input
    es.finalize(factory, "-", 0L, resolver, dataStats, conf);
    assertEquals("Mismatch in trace signature", Summarizer.NA, es.getInputTraceSignature());
    assertEquals("Mismatch in trace file location", Summarizer.NA, es.getInputTraceLocation());
}
Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class TestGridmixSummary, method testDataStatistics.
/**
 * Test {@link DataStatistics}: getter behavior with compression on/off, and
 * {@code GenerateData.publishDataStatistics} over empty, plain, and
 * compressed input directories under both compression-emulation settings.
 *
 * @throws Exception on unexpected filesystem or publishing failure
 */
@Test
public void testDataStatistics() throws Exception {
    // test data-statistics getters with compression enabled
    DataStatistics stats = new DataStatistics(10, 2, true);
    assertEquals("Data size mismatch", 10, stats.getDataSize());
    assertEquals("Num files mismatch", 2, stats.getNumFiles());
    assertTrue("Compression configuration mismatch", stats.isDataCompressed());
    // test data-statistics getters with compression disabled
    stats = new DataStatistics(100, 5, false);
    assertEquals("Data size mismatch", 100, stats.getDataSize());
    assertEquals("Num files mismatch", 5, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test publish data stats
    Configuration conf = new Configuration();
    Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"));
    Path testDir = new Path(rootTempDir, "testDataStatistics");
    FileSystem fs = testDir.getFileSystem(conf);
    fs.delete(testDir, true);
    Path testInputDir = new Path(testDir, "test");
    fs.mkdirs(testInputDir);
    // test empty folder (compression = true): publishing must fail because
    // compression emulation requires at least one compressed input file
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    boolean failed = false;
    try {
        GenerateData.publishDataStatistics(testInputDir, 1024L, conf);
    } catch (RuntimeException e) {
        failed = true;
    }
    assertTrue("Compression data publishing error", failed);
    // test with empty folder (compression = off)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
    stats = GenerateData.publishDataStatistics(testInputDir, 1024L, conf);
    assertEquals("Data size mismatch", 0, stats.getDataSize());
    assertEquals("Num files mismatch", 0, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test with some plain input data (compression = off)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
    Path inputDataFile = new Path(testInputDir, "test");
    // 0777 (octal, rwxrwxrwx) — the original used decimal 777, which yields
    // nonsensical permission bits
    long size = UtilsForTests.createTmpFileDFS(fs, inputDataFile, FsPermission.createImmutable((short) 0777), "hi hello bye").size();
    stats = GenerateData.publishDataStatistics(testInputDir, -1, conf);
    assertEquals("Data size mismatch", size, stats.getDataSize());
    assertEquals("Num files mismatch", 1, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test with some plain input data (compression = on): publishing must fail
    // because no file carries a compressed suffix
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    failed = false;
    try {
        GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
    } catch (RuntimeException e) {
        failed = true;
    }
    assertTrue("Compression data publishing error", failed);
    // test with some compressed input data (compression = off)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
    fs.delete(inputDataFile, false);
    inputDataFile = new Path(testInputDir, "test.gz");
    size = UtilsForTests.createTmpFileDFS(fs, inputDataFile, FsPermission.createImmutable((short) 0777), "hi hello").size();
    stats = GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
    assertEquals("Data size mismatch", size, stats.getDataSize());
    assertEquals("Num files mismatch", 1, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test with some compressed input data (compression = on)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    stats = GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
    assertEquals("Data size mismatch", size, stats.getDataSize());
    assertEquals("Num files mismatch", 1, stats.getNumFiles());
    assertTrue("Compression configuration mismatch", stats.isDataCompressed());
}
Aggregations