Search in sources :

Example 1 with URIDataPuller

Use of org.apache.druid.segment.loading.URIDataPuller in the druid project by druid-io.

From the class UriCacheGenerator, method generateCache.

/**
 * Loads lookup data from the URI configured on the namespace into a new versioned cache.
 *
 * <p>When the namespace defines a URI prefix, the puller registered for that prefix's scheme is
 * asked for the latest version under the prefix (optionally filtered by the namespace's file
 * regex); otherwise the namespace's plain URI is used directly. The load itself runs inside
 * {@code RetryUtils.retry}, using the puller's own retry predicate and {@code DEFAULT_NUM_RETRIES}.
 *
 * @param extractionNamespace namespace describing where the data lives and how to parse it
 * @param entryId             scheduler entry the cache is created for; also used in log output
 * @param lastVersion         version of the previously loaded cache, or {@code null} if none
 * @param scheduler           scheduler used to create the new versioned cache
 * @return the populated cache, or {@code null} when the version reported by the puller equals
 *         {@code lastVersion}
 * @throws IAE                   if no puller is registered for the URI scheme, or the registered
 *                               one is not a {@code URIDataPuller}
 * @throws FileNotFoundException if searching under the URI prefix yields no match
 * @throws Exception             anything else propagated by the retried load
 */
@Override
@Nullable
public CacheScheduler.VersionedCache generateCache(final UriExtractionNamespace extractionNamespace, final CacheScheduler.EntryImpl<UriExtractionNamespace> entryId, @Nullable final String lastVersion, final CacheScheduler scheduler) throws Exception {
    // A configured prefix means "search for the newest file"; otherwise load the exact URI.
    final boolean searchByPrefix = extractionNamespace.getUriPrefix() != null;
    final URI sourceUri;
    if (searchByPrefix) {
        sourceUri = extractionNamespace.getUriPrefix();
    } else {
        sourceUri = extractionNamespace.getUri();
    }

    // Pullers are looked up by URI scheme and must also be able to stream data.
    final String scheme = sourceUri.getScheme();
    final SearchableVersionedDataFinder<URI> finder = pullers.get(scheme);
    if (finder == null) {
        throw new IAE("Unknown loader type[%s].  Known types are %s", scheme, pullers.keySet());
    }
    if (!(finder instanceof URIDataPuller)) {
        throw new IAE("Cannot load data from location [%s]. Data pulling from [%s] not supported", sourceUri, scheme);
    }
    final URIDataPuller dataPuller = (URIDataPuller) finder;

    final URI uri;
    if (!searchByPrefix) {
        uri = extractionNamespace.getUri();
    } else {
        // A null pattern is handed to the finder as-is when no file regex is configured.
        final Pattern filePattern = extractionNamespace.getFileRegex() == null
            ? null
            : Pattern.compile(extractionNamespace.getFileRegex());
        uri = finder.getLatestVersion(extractionNamespace.getUriPrefix(), filePattern);
        if (uri == null) {
            throw new FileNotFoundException(StringUtils.format("Could not find match for pattern `%s` in [%s] for %s", filePattern, sourceUri, extractionNamespace));
        }
    }

    return RetryUtils.retry(() -> {
        final String version = dataPuller.getVersion(uri);
        // NOTE(review): nothing in this try block obviously throws NumberFormatException; the
        // catch looks vestigial - confirm before removing.
        try {
            // equals() is invoked on version (not lastVersion) because lastVersion may be null.
            if (version.equals(lastVersion)) {
                log.debug("URI [%s] for [%s] has the same last modified time [%s] as the last cached. Skipping ", uri.toString(), entryId, version);
                return null;
            }
        } catch (NumberFormatException ex) {
            log.debug(ex, "Failed to get last modified timestamp. Assuming no timestamp");
        }

        // Opens a fresh stream on every call, decompressed according to the URI path.
        final ByteSource decompressedSource = new ByteSource() {

            @Override
            public InputStream openStream() throws IOException {
                final InputStream rawStream = dataPuller.getInputStream(uri);
                return CompressionUtils.decompress(rawStream, uri.getPath());
            }
        };

        final CacheScheduler.VersionedCache versionedCache = scheduler.createVersionedCache(entryId, version);
        try {
            final long startNs = System.nanoTime();
            // Byte limit passed to the populator, derived from the namespace's heap percentage.
            final long byteLimit = (long) (MAX_MEMORY * extractionNamespace.getMaxHeapPercentage() / 100.0);
            final String cacheLabel = entryId == null ? null : entryId.toString();
            final MapPopulator.PopulateResult populateResult =
                new MapPopulator<>(extractionNamespace.getNamespaceParseSpec().getParser())
                    .populateAndWarnAtByteLimit(decompressedSource, versionedCache.getCache(), byteLimit, cacheLabel);
            final long duration = System.nanoTime() - startNs;
            log.info("Finished loading %,d values (%d bytes) from %,d lines for [%s] in %,d ns", populateResult.getEntries(), populateResult.getBytes(), populateResult.getLines(), entryId, duration);
            return versionedCache;
        } catch (Throwable t) {
            // Close the half-built cache, keeping any close failure attached to the original error.
            try {
                versionedCache.close();
            } catch (Exception e) {
                t.addSuppressed(e);
            }
            throw t;
        }
    }, dataPuller.shouldRetryPredicate(), DEFAULT_NUM_RETRIES);
}
Also used : Pattern(java.util.regex.Pattern) FileNotFoundException(java.io.FileNotFoundException) IAE(org.apache.druid.java.util.common.IAE) URI(java.net.URI) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) URIDataPuller(org.apache.druid.segment.loading.URIDataPuller) ByteSource(com.google.common.io.ByteSource) CacheScheduler(org.apache.druid.server.lookup.namespace.cache.CacheScheduler) MapPopulator(org.apache.druid.data.input.MapPopulator) Nullable(javax.annotation.Nullable)

Aggregations

ByteSource (com.google.common.io.ByteSource)1 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 URI (java.net.URI)1 Pattern (java.util.regex.Pattern)1 Nullable (javax.annotation.Nullable)1 MapPopulator (org.apache.druid.data.input.MapPopulator)1 IAE (org.apache.druid.java.util.common.IAE)1 URIDataPuller (org.apache.druid.segment.loading.URIDataPuller)1 CacheScheduler (org.apache.druid.server.lookup.namespace.cache.CacheScheduler)1