Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class Gridmix, method start.
/**
 * Runs a complete Gridmix cycle: optional input-data generation, emulation
 * setup, job submission from the trace, and shutdown of all pipeline threads.
 *
 * @param conf gridmix configuration
 * @param traceIn trace file path (if it is '-', then trace comes from the
 * stream stdin)
 * @param ioPath Working directory for gridmix. GenerateData job
 * will generate data in the directory &lt;ioPath&gt;/input/ and
 * distributed cache data is generated in the directory
 * &lt;ioPath&gt;/distributedCache/, if -generate option is
 * specified.
 * @param genbytes size of input data to be generated under the directory
 * &lt;ioPath&gt;/input/
 * @param userResolver gridmix user resolver
 * @return exit code (0 on success, non-zero on any setup or startup failure)
 * @throws IOException if the trace contains an error
 * @throws InterruptedException if interrupted while joining pipeline threads
 */
int start(Configuration conf, String traceIn, Path ioPath, long genbytes, UserResolver userResolver) throws IOException, InterruptedException {
    DataStatistics stats = null;
    // NOTE(review): 'trace' is never assigned a non-null stream in this method;
    // the cleanup below is effectively a no-op — confirm against full file.
    InputStream trace = null;
    int exitCode = 0;
    try {
        Path scratchDir = new Path(ioPath, conf.get(GRIDMIX_OUT_DIR, "gridmix"));
        // add shutdown hook for SIGINT, etc.
        Runtime.getRuntime().addShutdownHook(sdh);
        CountDownLatch startFlag = new CountDownLatch(1);
        try {
            // Create, start job submission threads
            startThreads(conf, traceIn, ioPath, scratchDir, startFlag, userResolver);
            Path inputDir = getGridmixInputDataPath(ioPath);
            // Write input data if specified
            exitCode = writeInputData(genbytes, inputDir);
            if (exitCode != 0) {
                return exitCode;
            }
            // publish the data statistics
            stats = GenerateData.publishDataStatistics(inputDir, genbytes, conf);
            // scan input dir contents
            submitter.refreshFilePool();
            boolean shouldGenerate = (genbytes > 0);
            // set up the needed things for emulation of various loads
            exitCode = setupEmulation(conf, traceIn, scratchDir, ioPath, shouldGenerate);
            if (exitCode != 0) {
                return exitCode;
            }
            // start the summarizer
            summarizer.start(conf);
            factory.start();
            statistics.start();
        } catch (Throwable e) {
            // Log the failure through the logger instead of printStackTrace():
            // summary at ERROR, full stack trace only when DEBUG is enabled
            // (preserves the original stderr-free-at-INFO behavior).
            LOG.error("Startup failed. " + e.toString());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Startup failure stack trace", e);
            }
            // abort pipeline
            if (factory != null)
                factory.abort();
            exitCode = STARTUP_FAILED_ERROR;
        } finally {
            // signal for factory to start; sets start time
            startFlag.countDown();
        }
        if (factory != null) {
            // wait for input exhaustion
            factory.join(Long.MAX_VALUE);
            final Throwable badTraceException = factory.error();
            if (null != badTraceException) {
                LOG.error("Error in trace", badTraceException);
                throw new IOException("Error in trace", badTraceException);
            }
            // wait for pending tasks to be submitted
            submitter.shutdown();
            submitter.join(Long.MAX_VALUE);
            // wait for running tasks to complete
            monitor.shutdown();
            monitor.join(Long.MAX_VALUE);
            statistics.shutdown();
            statistics.join(Long.MAX_VALUE);
        }
    } finally {
        if (factory != null) {
            summarizer.finalize(factory, traceIn, genbytes, userResolver, stats, conf);
        }
        IOUtils.cleanup(LOG, trace);
    }
    return exitCode;
}
Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class CompressionEmulationUtil, method publishCompressedDataStatistics.
/** Publishes compression related data statistics. Following statistics are
 * published
 * <ul>
 * <li>Total compressed input data size</li>
 * <li>Number of compressed input data files</li>
 * <li>Compression Ratio</li>
 * <li>Text data dictionary size</li>
 * <li>Random text word size</li>
 * </ul>
 *
 * @param inputDir directory to scan for compressed input files
 * @param conf configuration used to resolve the filesystem and codecs
 * @param uncompressedDataSize original (pre-compression) data size; ratio is
 *        published only when this is positive
 * @return statistics over the compressed files found
 * @throws IOException on filesystem access failure
 * @throws RuntimeException if no compressed file is found in {@code inputDir}
 */
static DataStatistics publishCompressedDataStatistics(Path inputDir, Configuration conf, long uncompressedDataSize) throws IOException {
    FileSystem fs = inputDir.getFileSystem(conf);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    // iterate over compressed files and sum up the compressed file sizes
    long compressedDataSize = 0;
    int numCompressedFiles = 0;
    // obtain input data file statuses
    FileStatus[] outFileStatuses = fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
    for (FileStatus status : outFileStatuses) {
        // the factory is constructed above and can never be null here; a
        // non-null codec means the file suffix matches a known compression codec
        CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
        if (codec != null) {
            ++numCompressedFiles;
            compressedDataSize += status.getLen();
        }
    }
    LOG.info("Gridmix is configured to use compressed input data.");
    // publish the input data size
    LOG.info("Total size of compressed input data : " + StringUtils.humanReadableInt(compressedDataSize));
    LOG.info("Total number of compressed input data files : " + numCompressedFiles);
    if (numCompressedFiles == 0) {
        throw new RuntimeException("No compressed file found in the input" + " directory : " + inputDir.toString() + ". To enable compression" + " emulation, run Gridmix either with " + " an input directory containing compressed input file(s) or" + " use the -generate option to (re)generate it. If compression" + " emulation is not desired, disable it by setting '" + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
    }
    // publish compression ratio only if its generated in this gridmix run
    if (uncompressedDataSize > 0) {
        // compute the compression ratio
        double ratio = ((double) compressedDataSize) / uncompressedDataSize;
        // publish the compression ratio
        LOG.info("Input Data Compression Ratio : " + ratio);
    }
    return new DataStatistics(compressedDataSize, numCompressedFiles, true);
}
Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class TestGridmixSummary, method testExecutionSummarizer.
/**
 * Test {@link ExecutionSummarizer}: command-line capture, start/simulation
 * timestamps, per-job stat accumulation, and finalize() output fields
 * (trace signature, data size stringification, resolver, policy).
 */
@Test
@SuppressWarnings({ "unchecked", "rawtypes" })
public void testExecutionSummarizer() throws IOException {
    Configuration conf = new Configuration();
    ExecutionSummarizer es = new ExecutionSummarizer();
    assertEquals("ExecutionSummarizer init failed", Summarizer.NA, es.getCommandLineArgsString());
    long startTime = System.currentTimeMillis();
    // test configuration parameters
    String[] initArgs = new String[] { "-Xmx20m", "-Dtest.args='test'" };
    es = new ExecutionSummarizer(initArgs);
    assertEquals("ExecutionSummarizer init failed", "-Xmx20m -Dtest.args='test'", es.getCommandLineArgsString());
    // test start time
    assertTrue("Start time mismatch", es.getStartTime() >= startTime);
    assertTrue("Start time mismatch", es.getStartTime() <= System.currentTimeMillis());
    // test start() of ExecutionSummarizer
    es.update(null);
    assertEquals("ExecutionSummarizer init failed", 0, es.getSimulationStartTime());
    testExecutionSummarizer(0, 0, 0, 0, 0, 0, 0, es);
    long simStartTime = System.currentTimeMillis();
    es.start(null);
    assertTrue("Simulation start time mismatch", es.getSimulationStartTime() >= simStartTime);
    assertTrue("Simulation start time mismatch", es.getSimulationStartTime() <= System.currentTimeMillis());
    // test with job stats
    JobStats stats = generateFakeJobStats(1, 10, true, false);
    es.update(stats);
    testExecutionSummarizer(1, 10, 0, 1, 1, 0, 0, es);
    // test with failed job
    stats = generateFakeJobStats(5, 1, false, false);
    es.update(stats);
    testExecutionSummarizer(6, 11, 0, 2, 1, 1, 0, es);
    // test with successful but lost job
    stats = generateFakeJobStats(1, 1, true, true);
    es.update(stats);
    testExecutionSummarizer(7, 12, 0, 3, 1, 1, 1, es);
    // test with failed but lost job
    stats = generateFakeJobStats(2, 2, false, true);
    es.update(stats);
    testExecutionSummarizer(9, 14, 0, 4, 1, 1, 2, es);
    // test finalize
    // define a fake job factory
    JobFactory factory = new FakeJobFactory(conf);
    // fake the num jobs in trace
    factory.numJobsInTrace = 3;
    Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"));
    Path testDir = new Path(rootTempDir, "testGridmixSummary");
    Path testTraceFile = new Path(testDir, "test-trace.json");
    FileSystem fs = FileSystem.getLocal(conf);
    fs.create(testTraceFile).close();
    // finalize the summarizer
    UserResolver resolver = new RoundRobinUserResolver();
    DataStatistics dataStats = new DataStatistics(100, 2, true);
    String policy = GridmixJobSubmissionPolicy.REPLAY.name();
    conf.set(GridmixJobSubmissionPolicy.JOB_SUBMISSION_POLICY, policy);
    es.finalize(factory, testTraceFile.toString(), 1024L, resolver, dataStats, conf);
    // test num jobs in trace (message typo fixed: "Mismtach" -> "Mismatch")
    assertEquals("Mismatch in num jobs in trace", 3, es.getNumJobsInTrace());
    // test trace signature
    String tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    // test trace location
    Path qPath = fs.makeQualified(testTraceFile);
    assertEquals("Mismatch in trace filename", qPath.toString(), es.getInputTraceLocation());
    // test expected data size
    assertEquals("Mismatch in expected data size", "1 K", es.getExpectedDataSize());
    // test input data statistics
    assertEquals("Mismatch in input data statistics", ExecutionSummarizer.stringifyDataStatistics(dataStats), es.getInputDataStatistics());
    // test user resolver
    assertEquals("Mismatch in user resolver", resolver.getClass().getName(), es.getUserResolver());
    // test policy
    assertEquals("Mismatch in policy", policy, es.getJobSubmissionPolicy());
    // test data stringification using large data
    es.finalize(factory, testTraceFile.toString(), 1024 * 1024 * 1024 * 10L, resolver, dataStats, conf);
    assertEquals("Mismatch in expected data size", "10 G", es.getExpectedDataSize());
    // test trace signature uniqueness
    // touch the trace file
    fs.delete(testTraceFile, false);
    // sleep for 1 sec so the recreated file gets a different timestamp
    try {
        Thread.sleep(1000);
    } catch (InterruptedException ie) {
        // restore the interrupt status instead of silently swallowing it
        Thread.currentThread().interrupt();
    }
    fs.create(testTraceFile).close();
    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, conf);
    // test missing expected data size
    assertEquals("Mismatch in trace data size", Summarizer.NA, es.getExpectedDataSize());
    assertFalse("Mismatch in trace signature", tid.equals(es.getInputTraceSignature()));
    // get the new identifier
    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    testTraceFile = new Path(testDir, "test-trace2.json");
    fs.create(testTraceFile).close();
    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, conf);
    assertFalse("Mismatch in trace signature", tid.equals(es.getInputTraceSignature()));
    // get the new identifier
    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    // finalize trace identifier '-' input
    es.finalize(factory, "-", 0L, resolver, dataStats, conf);
    assertEquals("Mismatch in trace signature", Summarizer.NA, es.getInputTraceSignature());
    assertEquals("Mismatch in trace file location", Summarizer.NA, es.getInputTraceLocation());
}
Example usage of org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics in the Apache Hadoop project: class TestGridmixSummary, method testDataStatistics.
/**
 * Test {@link DataStatistics}: getter behavior with compression on/off, and
 * {@code GenerateData.publishDataStatistics} over empty, plain, and
 * compressed input directories under both compression-emulation settings.
 *
 * @throws Exception on unexpected filesystem or publishing failure
 */
@Test
public void testDataStatistics() throws Exception {
    // test data-statistics getters with compression enabled
    DataStatistics stats = new DataStatistics(10, 2, true);
    assertEquals("Data size mismatch", 10, stats.getDataSize());
    assertEquals("Num files mismatch", 2, stats.getNumFiles());
    assertTrue("Compression configuration mismatch", stats.isDataCompressed());
    // test data-statistics getters with compression disabled
    stats = new DataStatistics(100, 5, false);
    assertEquals("Data size mismatch", 100, stats.getDataSize());
    assertEquals("Num files mismatch", 5, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test publish data stats
    Configuration conf = new Configuration();
    Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"));
    Path testDir = new Path(rootTempDir, "testDataStatistics");
    FileSystem fs = testDir.getFileSystem(conf);
    fs.delete(testDir, true);
    Path testInputDir = new Path(testDir, "test");
    fs.mkdirs(testInputDir);
    // test empty folder (compression = true): publishing must fail because
    // compression emulation requires at least one compressed input file
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    boolean failed = false;
    try {
        GenerateData.publishDataStatistics(testInputDir, 1024L, conf);
    } catch (RuntimeException e) {
        failed = true;
    }
    assertTrue("Compression data publishing error", failed);
    // test with empty folder (compression = off)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
    stats = GenerateData.publishDataStatistics(testInputDir, 1024L, conf);
    assertEquals("Data size mismatch", 0, stats.getDataSize());
    assertEquals("Num files mismatch", 0, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test with some plain input data (compression = off)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
    Path inputDataFile = new Path(testInputDir, "test");
    // 0777 (octal, rwxrwxrwx) — the original used decimal 777, which yields
    // nonsensical permission bits
    long size = UtilsForTests.createTmpFileDFS(fs, inputDataFile, FsPermission.createImmutable((short) 0777), "hi hello bye").size();
    stats = GenerateData.publishDataStatistics(testInputDir, -1, conf);
    assertEquals("Data size mismatch", size, stats.getDataSize());
    assertEquals("Num files mismatch", 1, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test with some plain input data (compression = on): publishing must fail
    // because no file carries a compressed suffix
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    failed = false;
    try {
        GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
    } catch (RuntimeException e) {
        failed = true;
    }
    assertTrue("Compression data publishing error", failed);
    // test with some compressed input data (compression = off)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
    fs.delete(inputDataFile, false);
    inputDataFile = new Path(testInputDir, "test.gz");
    size = UtilsForTests.createTmpFileDFS(fs, inputDataFile, FsPermission.createImmutable((short) 0777), "hi hello").size();
    stats = GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
    assertEquals("Data size mismatch", size, stats.getDataSize());
    assertEquals("Num files mismatch", 1, stats.getNumFiles());
    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
    // test with some compressed input data (compression = on)
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    stats = GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
    assertEquals("Data size mismatch", size, stats.getDataSize());
    assertEquals("Num files mismatch", 1, stats.getNumFiles());
    assertTrue("Compression configuration mismatch", stats.isDataCompressed());
}
Aggregations