Search in sources :

Example 1 with BeamSqlPipelineOptions

Use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlPipelineOptions in the Apache Beam project.

In class BeamSqlEnvRunner, method runUsingBeamSqlEnv.

/**
 * This is the alternative method in BeamTpcds.main method. Builds one pipeline per requested
 * query by planning it with BeamSqlEnv.parseQuery(), then submits every pipeline to a fixed-size
 * thread pool and prints an execution summary. (Doesn't perform well when running query96).
 *
 * @param args Command line arguments, parsed into {@code TpcdsOptions}
 * @throws Exception if parsing the options, registering the tables, creating a pipeline, reading
 *     a query or printing the execution summary fails
 */
public static void runUsingBeamSqlEnv(String[] args) throws Exception {
    InMemoryMetaStore inMemoryMetaStore = new InMemoryMetaStore();
    inMemoryMetaStore.registerProvider(new TextTableProvider());
    TpcdsOptions tpcdsOptions = PipelineOptionsFactory.fromArgs(args).withValidation().as(TpcdsOptions.class);
    String dataSize = TpcdsParametersReader.getAndCheckDataSize(tpcdsOptions);
    String[] queryNames = TpcdsParametersReader.getAndCheckQueryNames(tpcdsOptions);
    int nThreads = TpcdsParametersReader.getAndCheckTpcParallel(tpcdsOptions);
    // Using ExecutorService and CompletionService to fulfill multi-threading functionality
    ExecutorService executor = Executors.newFixedThreadPool(nThreads);
    CompletionService<TpcdsRunResult> completion = new ExecutorCompletionService<>(executor);
    try {
        // Directly create all tables and register them into inMemoryMetaStore before creating
        // BeamSqlEnv object.
        registerAllTablesByInMemoryMetaStore(inMemoryMetaStore, dataSize);
        BeamSqlPipelineOptions beamSqlPipelineOptions = tpcdsOptions.as(BeamSqlPipelineOptions.class);
        BeamSqlEnv env = BeamSqlEnv.builder(inMemoryMetaStore).setPipelineOptions(beamSqlPipelineOptions).setQueryPlannerClassName(beamSqlPipelineOptions.getPlannerName()).build();
        // Make an array of pipelines, each pipeline is responsible for running a corresponding query.
        Pipeline[] pipelines = new Pipeline[queryNames.length];
        // Build every query, transform each result into a PCollection<String> and write it as a
        // txt file under RESULT_DIRECTORY.
        for (int i = 0; i < queryNames.length; i++) {
            // For each query, get a copy of pipelineOptions from command line arguments, cast
            // tpcdsOptions as a DataflowPipelineOptions object to read and set required parameters for
            // pipeline execution.
            TpcdsOptions tpcdsOptionsCopy = PipelineOptionsFactory.fromArgs(args).withValidation().as(TpcdsOptions.class);
            DataflowPipelineOptions dataflowPipelineOptionsCopy = tpcdsOptionsCopy.as(DataflowPipelineOptions.class);
            // Set a unique job name using the time stamp so that multiple different pipelines can run
            // together.
            dataflowPipelineOptionsCopy.setJobName(queryNames[i] + "result" + System.currentTimeMillis());
            pipelines[i] = Pipeline.create(dataflowPipelineOptionsCopy);
            String queryString = QueryReader.readQuery(queryNames[i]);
            try {
                // Query execution
                PCollection<Row> rows = BeamSqlRelUtils.toPCollection(pipelines[i], env.parseQuery(queryString));
                // Transform the result from PCollection<Row> into PCollection<String>, and write it to the
                // location where results are stored.
                PCollection<String> rowStrings = rows.apply(MapElements.into(TypeDescriptors.strings()).via(Row::toString));
                rowStrings.apply(TextIO.write().to(RESULT_DIRECTORY + "/" + dataSize + "/" + pipelines[i].getOptions().getJobName()).withSuffix(".txt").withNumShards(1));
            } catch (Exception e) {
                // Pass the exception to the logger so the stack trace lands in the log output
                // instead of being printed to stderr.
                LOG.error("{} failed to execute", queryNames[i], e);
            }
            // The pipeline is submitted even when building the query failed, so that the summary
            // below still receives one result per query name.
            completion.submit(new TpcdsRun(pipelines[i]));
        }
    } finally {
        // Stop accepting new tasks; already-submitted runs keep going. Doing this in finally makes
        // sure the pool is also released when setup or the loop throws.
        executor.shutdown();
    }
    printExecutionSummary(completion, queryNames.length);
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) TextTableProvider(org.apache.beam.sdk.extensions.sql.meta.provider.text.TextTableProvider) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) Pipeline(org.apache.beam.sdk.Pipeline) BeamSqlPipelineOptions(org.apache.beam.sdk.extensions.sql.impl.BeamSqlPipelineOptions) ExecutorService(java.util.concurrent.ExecutorService) BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) Row(org.apache.beam.sdk.values.Row) InMemoryMetaStore(org.apache.beam.sdk.extensions.sql.meta.store.InMemoryMetaStore)

Aggregations

ExecutorCompletionService (java.util.concurrent.ExecutorCompletionService)1 ExecutorService (java.util.concurrent.ExecutorService)1 DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)1 Pipeline (org.apache.beam.sdk.Pipeline)1 BeamSqlEnv (org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv)1 BeamSqlPipelineOptions (org.apache.beam.sdk.extensions.sql.impl.BeamSqlPipelineOptions)1 TextTableProvider (org.apache.beam.sdk.extensions.sql.meta.provider.text.TextTableProvider)1 InMemoryMetaStore (org.apache.beam.sdk.extensions.sql.meta.store.InMemoryMetaStore)1 Row (org.apache.beam.sdk.values.Row)1