
Example 1 with Lifecycle

Use of com.metamx.common.lifecycle.Lifecycle in the Apache Hive project.

The class DruidStorageHandler, method commitCreateTable.

@Override
public void commitCreateTable(Table table) throws MetaException {
    if (MetaStoreUtils.isExternalTable(table)) {
        return;
    }
    Lifecycle lifecycle = new Lifecycle();
    LOG.info(String.format("Committing table [%s] to the druid metastore", table.getDbName()));
    final Path tableDir = getSegmentDescriptorDir();
    try {
        List<DataSegment> segmentList = DruidStorageHandlerUtils.getPublishedSegments(tableDir, getConf());
        LOG.info(String.format("Found [%d] segments under path [%s]", segmentList.size(), tableDir));
        druidSqlMetadataStorageUpdaterJobHandler.publishSegments(druidMetadataStorageTablesConfig.getSegmentsTable(), segmentList, DruidStorageHandlerUtils.JSON_MAPPER);
        final String coordinatorAddress = HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_COORDINATOR_DEFAULT_ADDRESS);
        int maxTries = HiveConf.getIntVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_MAX_TRIES);
        final String dataSourceName = table.getParameters().get(Constants.DRUID_DATA_SOURCE);
        LOG.info(String.format("checking load status from coordinator [%s]", coordinatorAddress));
        // check if the coordinator is up
        httpClient = makeHttpClient(lifecycle);
        try {
            lifecycle.start();
        } catch (Exception e) {
            Throwables.propagate(e);
        }
        String coordinatorResponse = null;
        try {
            coordinatorResponse = RetryUtils.retry(new Callable<String>() {

                @Override
                public String call() throws Exception {
                    return DruidStorageHandlerUtils.getURL(httpClient, new URL(String.format("http://%s/status", coordinatorAddress)));
                }
            }, new Predicate<Throwable>() {

                @Override
                public boolean apply(@Nullable Throwable input) {
                    return input instanceof IOException;
                }
            }, maxTries);
        } catch (Exception e) {
            console.printInfo("Will skip waiting for data loading");
            return;
        }
        if (Strings.isNullOrEmpty(coordinatorResponse)) {
            console.printInfo("Will skip waiting for data loading");
            return;
        }
        console.printInfo(String.format("Waiting for the loading of [%s] segments", segmentList.size()));
        long passiveWaitTimeMs = HiveConf.getLongVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_PASSIVE_WAIT_TIME);
        ImmutableSet<URL> setOfUrls = FluentIterable.from(segmentList).transform(new Function<DataSegment, URL>() {

            @Override
            public URL apply(DataSegment dataSegment) {
                try {
                    // Need to make sure that we are using UTC, since most Druid clusters use UTC by default
                    return new URL(String.format("http://%s/druid/coordinator/v1/datasources/%s/segments/%s", coordinatorAddress, dataSourceName, DataSegment.makeDataSegmentIdentifier(dataSegment.getDataSource(), new DateTime(dataSegment.getInterval().getStartMillis(), DateTimeZone.UTC), new DateTime(dataSegment.getInterval().getEndMillis(), DateTimeZone.UTC), dataSegment.getVersion(), dataSegment.getShardSpec())));
                } catch (MalformedURLException e) {
                    Throwables.propagate(e);
                }
                return null;
            }
        }).toSet();
        int numRetries = 0;
        while (numRetries++ < maxTries && !setOfUrls.isEmpty()) {
            setOfUrls = ImmutableSet.copyOf(Sets.filter(setOfUrls, new Predicate<URL>() {

                @Override
                public boolean apply(URL input) {
                    try {
                        String result = DruidStorageHandlerUtils.getURL(httpClient, input);
                        LOG.debug(String.format("Checking segment [%s] response is [%s]", input, result));
                        return Strings.isNullOrEmpty(result);
                    } catch (IOException e) {
                        LOG.error(String.format("Error while checking URL [%s]", input), e);
                        return true;
                    }
                }
            }));
            try {
                if (!setOfUrls.isEmpty()) {
                    Thread.sleep(passiveWaitTimeMs);
                }
            } catch (InterruptedException e) {
                Thread.interrupted();
                Throwables.propagate(e);
            }
        }
        if (!setOfUrls.isEmpty()) {
            // We are not throwing an exception here, since a transient issue might be blocking the loading
            console.printError(String.format("Wait time exhausted and we have [%s] out of [%s] segments not loaded yet", setOfUrls.size(), segmentList.size()));
        }
    } catch (IOException e) {
        LOG.error("Exception while commit", e);
        Throwables.propagate(e);
    } finally {
        cleanWorkingDir();
        lifecycle.stop();
    }
}
Also used: Path (org.apache.hadoop.fs.Path), MalformedURLException (java.net.MalformedURLException), Lifecycle (com.metamx.common.lifecycle.Lifecycle), IOException (java.io.IOException), DataSegment (io.druid.timeline.DataSegment), MetaException (org.apache.hadoop.hive.metastore.api.MetaException), SegmentLoadingException (io.druid.segment.loading.SegmentLoadingException), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), Callable (java.util.concurrent.Callable), URL (java.net.URL), DateTime (org.joda.time.DateTime), Predicate (com.google.common.base.Predicate), Function (com.google.common.base.Function), Nullable (javax.annotation.Nullable)
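All three examples on this page share the same resource-management pattern: create a Lifecycle, build an HttpClient bound to it, start the lifecycle before issuing requests, and stop it when the work is done. The stripped-down sketch below (not taken from the Hive source) shows only that pattern, reusing the HttpClientInit/HttpClientConfig calls that appear in Examples 2 and 3; the connection count and read timeout are placeholder values.

// Minimal sketch of the Lifecycle-managed HttpClient pattern (not from the Hive source).
import com.metamx.common.lifecycle.Lifecycle;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.HttpClientConfig;
import com.metamx.http.client.HttpClientInit;
import org.joda.time.Period;

public class LifecycleHttpClientSketch {

    public static void main(String[] args) throws Exception {
        Lifecycle lifecycle = new Lifecycle();
        // The client is registered with the lifecycle and becomes usable once start() succeeds.
        HttpClient client = HttpClientInit.createClient(HttpClientConfig.builder().withNumConnections(1).withReadTimeout(new Period("PT10S").toStandardDuration()).build(), lifecycle);
        try {
            lifecycle.start();
            // Issue requests with the client here, e.g. via DruidStorageHandlerUtils.getURL(client, url)
            // or DruidStorageHandlerUtils.submitRequest(client, request), as in the examples on this page.
        } finally {
            // Stopping the lifecycle shuts down the client's connection pool.
            lifecycle.stop();
        }
    }
}

Stopping the lifecycle in a finally block mirrors what the Hive code does and avoids leaking connection-pool resources when a request fails part-way through.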

Example 2 with Lifecycle

Use of com.metamx.common.lifecycle.Lifecycle in the Apache Hive project.

The class DruidQueryBasedInputFormat, method splitSelectQuery.

/* Method that splits the Select query depending on the threshold so that the read can be
   * parallelized. We will only contact the Druid broker to obtain all results. */
private static HiveDruidSplit[] splitSelectQuery(Configuration conf, String address, SelectQuery query, Path dummyPath) throws IOException {
    final int selectThreshold = (int) HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_SELECT_THRESHOLD);
    final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
    final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
    final boolean isFetch = query.getContextBoolean(Constants.DRUID_QUERY_FETCH, false);
    if (isFetch) {
        // If it has a limit, we use it and we do not split the query
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    // We do not have the number of rows, thus we need to execute a
    // Segment Metadata query to obtain number of rows
    SegmentMetadataQueryBuilder metadataBuilder = new Druids.SegmentMetadataQueryBuilder();
    metadataBuilder.dataSource(query.getDataSource());
    metadataBuilder.intervals(query.getIntervals());
    metadataBuilder.merge(true);
    metadataBuilder.analysisTypes();
    SegmentMetadataQuery metadataQuery = metadataBuilder.build();
    Lifecycle lifecycle = new Lifecycle();
    HttpClient client = HttpClientInit.createClient(HttpClientConfig.builder().withNumConnections(numConnection).withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
    try {
        lifecycle.start();
    } catch (Exception e) {
        LOG.error("Lifecycle start issue");
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    InputStream response;
    try {
        response = DruidStorageHandlerUtils.submitRequest(client, DruidStorageHandlerUtils.createRequest(address, metadataQuery));
    } catch (Exception e) {
        lifecycle.stop();
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    // Retrieve results
    List<SegmentAnalysis> metadataList;
    try {
        metadataList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response, new TypeReference<List<SegmentAnalysis>>() {
        });
    } catch (Exception e) {
        response.close();
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    } finally {
        lifecycle.stop();
    }
    if (metadataList == null) {
        throw new IOException("Connected to Druid but could not retrieve datasource information");
    }
    if (metadataList.isEmpty()) {
        // There are no rows for that time range, we can submit query as it is
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    if (metadataList.size() != 1) {
        throw new IOException("Information about segments should have been merged");
    }
    final long numRows = metadataList.get(0).getNumRows();
    query = query.withPagingSpec(PagingSpec.newSpec(Integer.MAX_VALUE));
    if (numRows <= selectThreshold) {
        // We are not going to split it
        return new HiveDruidSplit[] { new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(query), dummyPath, new String[] { address }) };
    }
    // If the query does not specify a time interval, we obtain the total time range using
    // a Time Boundary query. Then, we use that information to split the query
    // according to the Select threshold configuration property
    final List<Interval> intervals = new ArrayList<>();
    if (query.getIntervals().size() == 1 && query.getIntervals().get(0).withChronology(ISOChronology.getInstanceUTC()).equals(DruidTable.DEFAULT_INTERVAL)) {
        // Default max and min, we should execute a time boundary query to get a
        // more precise range
        TimeBoundaryQueryBuilder timeBuilder = new Druids.TimeBoundaryQueryBuilder();
        timeBuilder.dataSource(query.getDataSource());
        TimeBoundaryQuery timeQuery = timeBuilder.build();
        lifecycle = new Lifecycle();
        client = HttpClientInit.createClient(HttpClientConfig.builder().withNumConnections(numConnection).withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
        try {
            lifecycle.start();
        } catch (Exception e) {
            LOG.error("Lifecycle start issue");
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
        try {
            response = DruidStorageHandlerUtils.submitRequest(client, DruidStorageHandlerUtils.createRequest(address, timeQuery));
        } catch (Exception e) {
            lifecycle.stop();
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
        // Retrieve results
        List<Result<TimeBoundaryResultValue>> timeList;
        try {
            timeList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response, new TypeReference<List<Result<TimeBoundaryResultValue>>>() {
            });
        } catch (Exception e) {
            response.close();
            throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
        } finally {
            lifecycle.stop();
        }
        if (timeList == null || timeList.isEmpty()) {
            throw new IOException("Connected to Druid but could not retrieve time boundary information");
        }
        if (timeList.size() != 1) {
            throw new IOException("We should obtain a single time boundary");
        }
        intervals.add(new Interval(timeList.get(0).getValue().getMinTime().getMillis(), timeList.get(0).getValue().getMaxTime().getMillis(), ISOChronology.getInstanceUTC()));
    } else {
        intervals.addAll(query.getIntervals());
    }
    // Create (numRows/default threshold) input splits
    int numSplits = (int) Math.ceil((double) numRows / selectThreshold);
    List<List<Interval>> newIntervals = createSplitsIntervals(intervals, numSplits);
    HiveDruidSplit[] splits = new HiveDruidSplit[numSplits];
    for (int i = 0; i < numSplits; i++) {
        // Create partial Select query
        final SelectQuery partialQuery = query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(newIntervals.get(i)));
        splits[i] = new HiveDruidSplit(DruidStorageHandlerUtils.JSON_MAPPER.writeValueAsString(partialQuery), dummyPath, new String[] { address });
    }
    return splits;
}
Also used: ArrayList (java.util.ArrayList), MultipleIntervalSegmentSpec (io.druid.query.spec.MultipleIntervalSegmentSpec), TimeBoundaryQuery (io.druid.query.timeboundary.TimeBoundaryQuery), Result (io.druid.query.Result), SegmentMetadataQuery (io.druid.query.metadata.metadata.SegmentMetadataQuery), SegmentMetadataQueryBuilder (io.druid.query.Druids.SegmentMetadataQueryBuilder), SegmentAnalysis (io.druid.query.metadata.metadata.SegmentAnalysis), TimeBoundaryQueryBuilder (io.druid.query.Druids.TimeBoundaryQueryBuilder), List (java.util.List), TypeReference (com.fasterxml.jackson.core.type.TypeReference), InputStream (java.io.InputStream), Lifecycle (com.metamx.common.lifecycle.Lifecycle), Period (org.joda.time.Period), IOException (java.io.IOException), JsonParseException (com.fasterxml.jackson.core.JsonParseException), JsonMappingException (com.fasterxml.jackson.databind.JsonMappingException), SelectQuery (io.druid.query.select.SelectQuery), HttpClient (com.metamx.http.client.HttpClient), Interval (org.joda.time.Interval)
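The helper createSplitsIntervals called above is not shown on this page; only its call site fixes the signature (a list of intervals and the number of splits, returning one interval list per split). As a rough illustration only, the sketch below divides the overall range, treated as one contiguous span from the first start to the last end, into equally sized UTC sub-intervals; the real Hive implementation may distribute the ranges differently.

// Hypothetical sketch of an interval-splitting helper (not from the Hive source).
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.joda.time.Interval;
import org.joda.time.chrono.ISOChronology;

public class SplitIntervalsSketch {

    static List<List<Interval>> createSplitsIntervals(List<Interval> intervals, int numSplits) {
        // Assumption: the input covers one contiguous time range; take its overall start and end.
        long start = intervals.get(0).getStartMillis();
        long end = intervals.get(intervals.size() - 1).getEndMillis();
        long total = end - start;
        List<List<Interval>> result = new ArrayList<>(numSplits);
        for (int i = 0; i < numSplits; i++) {
            // Each split receives an equal share of the total range, expressed in UTC.
            long splitStart = start + (total * i) / numSplits;
            long splitEnd = start + (total * (i + 1)) / numSplits;
            result.add(Collections.singletonList(new Interval(splitStart, splitEnd, ISOChronology.getInstanceUTC())));
        }
        return result;
    }
}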

Example 3 with Lifecycle

Use of com.metamx.common.lifecycle.Lifecycle in the Apache Hive project.

The class DruidQueryRecordReader, method initialize.

public void initialize(InputSplit split, Configuration conf) throws IOException {
    HiveDruidSplit hiveDruidSplit = (HiveDruidSplit) split;
    // Create query
    query = createQuery(hiveDruidSplit.getDruidQuery());
    // Execute query
    if (LOG.isInfoEnabled()) {
        LOG.info("Retrieving from druid using query:\n " + query);
    }
    final Lifecycle lifecycle = new Lifecycle();
    final int numConnection = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
    final Period readTimeout = new Period(HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
    HttpClient client = HttpClientInit.createClient(HttpClientConfig.builder().withReadTimeout(readTimeout.toStandardDuration()).withNumConnections(numConnection).build(), lifecycle);
    try {
        lifecycle.start();
    } catch (Exception e) {
        LOG.error("Issues with lifecycle start", e);
    }
    InputStream response;
    try {
        response = DruidStorageHandlerUtils.submitRequest(client, DruidStorageHandlerUtils.createRequest(hiveDruidSplit.getLocations()[0], query));
    } catch (Exception e) {
        lifecycle.stop();
        throw new IOException(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    // Retrieve results
    List<R> resultsList;
    try {
        resultsList = createResultsList(response);
    } catch (IOException e) {
        response.close();
        throw e;
    } finally {
        lifecycle.stop();
    }
    if (resultsList == null || resultsList.isEmpty()) {
        return;
    }
    results = resultsList.iterator();
}
Also used: HiveDruidSplit (org.apache.hadoop.hive.druid.io.HiveDruidSplit), InputStream (java.io.InputStream), Lifecycle (com.metamx.common.lifecycle.Lifecycle), HttpClient (com.metamx.http.client.HttpClient), Period (org.joda.time.Period), IOException (java.io.IOException)
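The helpers createQuery and createResultsList used by initialize are not shown on this page. A hedged guess at what createResultsList does, based on the TypeReference-based readValue calls in Example 2: deserialize the SMILE-encoded response stream into a typed list. The mapper and the generic Map element type below are placeholders; the real reader uses the pre-configured DruidStorageHandlerUtils.SMILE_MAPPER and binds the stream to Druid result types.

// Hypothetical sketch of a createResultsList-style helper (not from the Hive source).
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.smile.SmileFactory;

public class ResultsListSketch {

    // Placeholder mapper; the Hive code uses DruidStorageHandlerUtils.SMILE_MAPPER instead.
    private static final ObjectMapper SMILE_MAPPER = new ObjectMapper(new SmileFactory());

    static List<Map<String, Object>> createResultsList(InputStream response) throws IOException {
        // Bind the binary SMILE payload to a generic list; the real reader binds to Druid row/result classes.
        return SMILE_MAPPER.readValue(response, new TypeReference<List<Map<String, Object>>>() {
        });
    }
}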

Aggregations

Lifecycle (com.metamx.common.lifecycle.Lifecycle): 3
IOException (java.io.IOException): 3
HttpClient (com.metamx.http.client.HttpClient): 2
InputStream (java.io.InputStream): 2
Period (org.joda.time.Period): 2
JsonParseException (com.fasterxml.jackson.core.JsonParseException): 1
TypeReference (com.fasterxml.jackson.core.type.TypeReference): 1
JsonMappingException (com.fasterxml.jackson.databind.JsonMappingException): 1
Function (com.google.common.base.Function): 1
Predicate (com.google.common.base.Predicate): 1
SegmentMetadataQueryBuilder (io.druid.query.Druids.SegmentMetadataQueryBuilder): 1
TimeBoundaryQueryBuilder (io.druid.query.Druids.TimeBoundaryQueryBuilder): 1
Result (io.druid.query.Result): 1
SegmentAnalysis (io.druid.query.metadata.metadata.SegmentAnalysis): 1
SegmentMetadataQuery (io.druid.query.metadata.metadata.SegmentMetadataQuery): 1
SelectQuery (io.druid.query.select.SelectQuery): 1
MultipleIntervalSegmentSpec (io.druid.query.spec.MultipleIntervalSegmentSpec): 1
TimeBoundaryQuery (io.druid.query.timeboundary.TimeBoundaryQuery): 1
SegmentLoadingException (io.druid.segment.loading.SegmentLoadingException): 1
DataSegment (io.druid.timeline.DataSegment): 1