
Example 16 with CountingInputStream

Use of com.google.common.io.CountingInputStream in project alluxio by Alluxio.

The class ObjectUnderFileInputStream, method openStream.

/**
   * Open a new stream.
   *
   * @param options for opening a stream
   * @throws IOException if a non-Alluxio error occurs
   */
private void openStream(OpenOptions options) throws IOException {
    if (mStream != null) {
        mStream.close();
    }
    mInitPos = options.getOffset();
    mStream = new CountingInputStream(mUfs.openObject(mKey, options));
}
Also used : CountingInputStream(com.google.common.io.CountingInputStream)
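Here the opening offset is kept in mInitPos and the raw object stream is wrapped in a CountingInputStream, which suggests the absolute position can later be derived as mInitPos plus mStream.getCount(). A minimal standalone sketch of that pattern, assuming only Guava and the JDK (the class name PositionTrackingExample, the byte-array source, and the offset value are made up for illustration):

import com.google.common.io.CountingInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class PositionTrackingExample {
    public static void main(String[] args) throws IOException {
        byte[] data = "0123456789".getBytes();
        // Pretend the object was opened at offset 4, as OpenOptions#getOffset would report.
        long initPos = 4;
        InputStream raw = new ByteArrayInputStream(data, (int) initPos, data.length - (int) initPos);
        CountingInputStream stream = new CountingInputStream(raw);

        // Consume three bytes; the wrapper counts them.
        stream.read(new byte[3]);
        long absolutePosition = initPos + stream.getCount();
        System.out.println(absolutePosition);   // prints 7
        stream.close();
    }
}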

Example 17 with CountingInputStream

Use of com.google.common.io.CountingInputStream in project beam by apache.

The class CoderProperties, method decode.

@VisibleForTesting
static <T> T decode(Coder<T> coder, Coder.Context context, byte[] bytes) throws CoderException, IOException {
    @SuppressWarnings("unchecked") Coder<T> deserializedCoder = SerializableUtils.clone(coder);
    byte[] buffer;
    if (context == Coder.Context.NESTED) {
        buffer = new byte[bytes.length + 1];
        System.arraycopy(bytes, 0, buffer, 0, bytes.length);
        buffer[bytes.length] = 1;
    } else {
        buffer = bytes;
    }
    CountingInputStream cis = new CountingInputStream(new ByteArrayInputStream(buffer));
    T value = deserializedCoder.decode(new UnownedInputStream(cis), context);
    assertThat("consumed bytes equal to encoded bytes", cis.getCount(), equalTo((long) bytes.length));
    return value;
}
Also used : UnownedInputStream(org.apache.beam.sdk.util.UnownedInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) CountingInputStream(com.google.common.io.CountingInputStream) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
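The test wraps the encoded bytes in a CountingInputStream so it can assert that the coder consumed exactly bytes.length bytes and no more. A standalone sketch of the same consumed-bytes check, with DataOutputStream/DataInputStream standing in for a real Coder (ConsumedBytesCheck and the writeLong/readLong round trip are invented for illustration; only Guava and the JDK are assumed):

import com.google.common.io.CountingInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class ConsumedBytesCheck {
    public static void main(String[] args) throws IOException {
        // "Encode" a single value; a real test would use a Coder here.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(bos)) {
            out.writeLong(42L);
        }
        byte[] encoded = bos.toByteArray();

        // "Decode" it through a CountingInputStream and verify the bytes consumed.
        CountingInputStream cis = new CountingInputStream(new ByteArrayInputStream(encoded));
        long value = new DataInputStream(cis).readLong();
        if (cis.getCount() != encoded.length) {
            throw new AssertionError("decoder consumed " + cis.getCount()
                    + " bytes, expected " + encoded.length);
        }
        System.out.println(value);   // prints 42
    }
}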

Example 18 with CountingInputStream

Use of com.google.common.io.CountingInputStream in project jackrabbit-oak by apache.

The class BinaryTextExtractor, method parseStringValue0.

private String parseStringValue0(Blob v, Metadata metadata, String path) {
    WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
    long start = System.currentTimeMillis();
    long bytesRead = 0;
    long length = v.length();
    if (log.isDebugEnabled()) {
        log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
    }
    String oldThreadName = null;
    if (length > SMALL_BINARY) {
        Thread t = Thread.currentThread();
        oldThreadName = t.getName();
        t.setName(oldThreadName + ": Extracting " + path + ", " + length + " bytes");
    }
    try {
        CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
        try {
            getParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            bytesRead = stream.getCount();
            stream.close();
        }
    } catch (LinkageError e) {
        // Capture and ignore errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            log.debug("[{}] Failed to extract text from a binary property: {}." + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", getIndexName(), path, t);
            extractedTextCache.put(v, ExtractedText.ERROR);
            return TEXT_EXTRACTION_ERROR;
        }
    } finally {
        if (oldThreadName != null) {
            Thread.currentThread().setName(oldThreadName);
        }
    }
    String result = handler.toString();
    if (bytesRead > 0) {
        long time = System.currentTimeMillis() - start;
        int len = result.length();
        recordTextExtractionStats(time, bytesRead, len);
        if (log.isDebugEnabled()) {
            log.debug("Extracting {} took {} ms, {} bytes read, {} text size", path, time, bytesRead, len);
        }
    }
    extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
    return result;
}
Also used : WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) LazyInputStream(org.apache.jackrabbit.oak.commons.io.LazyInputStream) CountingInputStream(com.google.common.io.CountingInputStream) ParseContext(org.apache.tika.parser.ParseContext) ExtractedText(org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText)
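Note that getCount() is read in the finally block, so bytesRead reflects how far extraction got even if Tika throws or the write limit is reached. A stripped-down sketch of that finally-block pattern, with a plain read loop standing in for parser.parse (CountInFinallyExample and processAndCount are made-up names; no Tika or Oak classes are involved):

import com.google.common.io.CountingInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;

public class CountInFinallyExample {
    static long processAndCount(byte[] data) throws IOException {
        long bytesRead = 0;
        CountingInputStream stream = new CountingInputStream(new ByteArrayInputStream(data));
        try {
            // Stand-in for parser.parse(stream, handler, metadata, context):
            // just drain the stream; a real parser might throw part-way through.
            while (stream.read() != -1) {
                // discard
            }
        } finally {
            // Captured whether or not processing succeeded.
            bytesRead = stream.getCount();
            stream.close();
        }
        return bytesRead;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(processAndCount(new byte[128]));   // prints 128
    }
}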

Example 19 with CountingInputStream

Use of com.google.common.io.CountingInputStream in project jackrabbit-oak by apache.

The class TextExtractor, method parseStringValue.

//~--------------------------------------< Tika >
private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
    WriteOutContentHandler handler = new WriteOutContentHandler(maxExtractedLength);
    long start = System.currentTimeMillis();
    long size = 0;
    try {
        CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource));
        try {
            tika.getParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            size = stream.getCount();
            stream.close();
        }
    } catch (LinkageError e) {
        // Capture and ignore errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            parserErrorCount.incrementAndGet();
            parserError.debug("Failed to extract text from a binary property: " + path + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", t);
            return ERROR_TEXT;
        }
    }
    String result = handler.toString();
    timeTaken.addAndGet(System.currentTimeMillis() - start);
    if (size > 0) {
        extractedTextSize.addAndGet(result.length());
        extractionCount.incrementAndGet();
        totalSizeRead.addAndGet(size);
        return result;
    }
    return null;
}
Also used : WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) LazyInputStream(org.apache.jackrabbit.oak.commons.io.LazyInputStream) CountingInputStream(com.google.common.io.CountingInputStream) ParseContext(org.apache.tika.parser.ParseContext)
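This variant additionally folds the per-stream count into cumulative AtomicLong counters (totalSizeRead, extractionCount, extractedTextSize) for extraction statistics. A rough standalone sketch of that aggregation, again with a read loop in place of the Tika parser (ExtractionMetricsExample is a made-up name and the counters only mirror the ones above; this is not Oak code):

import com.google.common.io.CountingInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

public class ExtractionMetricsExample {
    private static final AtomicLong totalSizeRead = new AtomicLong();
    private static final AtomicLong extractionCount = new AtomicLong();

    static void extract(byte[] source) throws IOException {
        try (CountingInputStream in = new CountingInputStream(new ByteArrayInputStream(source))) {
            // Stand-in for handing the stream to the Tika parser.
            while (in.read() != -1) {
                // discard
            }
            totalSizeRead.addAndGet(in.getCount());
            extractionCount.incrementAndGet();
        }
    }

    public static void main(String[] args) throws IOException {
        extract(new byte[100]);
        extract(new byte[250]);
        System.out.println(totalSizeRead.get() + " bytes over "
                + extractionCount.get() + " extractions");
    }
}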

Example 20 with CountingInputStream

Use of com.google.common.io.CountingInputStream in project jackrabbit-oak by apache.

The class IndexConsistencyChecker, method checkBlob.

private void checkBlob(String propName, Blob blob, Tree tree, Result result) {
    String id = blob.getContentIdentity();
    String blobPath = String.format("%s/%s/%s", tree.getPath(), propName, id);
    try {
        InputStream is = blob.getNewStream();
        CountingInputStream cis = new CountingInputStream(is);
        IOUtils.copyLarge(cis, ByteStreams.nullOutputStream());
        if (cis.getCount() != blob.length()) {
            String msg = String.format("Invalid blob %s. Length mismatch - expected ${%d} -> found ${%d}", blobPath, blob.length(), cis.getCount());
            result.invalidBlobIds.add(new FileSizeStatus(blobPath, cis.getCount(), blob.length()));
            log.warn("[{}] {}", indexPath, msg);
            result.clean = false;
            result.blobSizeMismatch = true;
        }
        result.binaryPropSize += cis.getCount();
    } catch (Exception e) {
        log.warn("[{}] Error occurred reading blob at {}", indexPath, blobPath, e);
        result.missingBlobIds.add(id);
        result.clean = false;
        result.missingBlobs = true;
    }
}
Also used : CountingInputStream(com.google.common.io.CountingInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException)
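Here the stream is drained into a null sink purely so that getCount() reveals how many bytes the blob actually delivers, which is then compared with blob.length(). A standalone sketch of that length check, using Guava's ByteStreams.copy in place of Commons IO's IOUtils.copyLarge (LengthCheckExample and lengthMatches are invented names; only Guava and the JDK are assumed):

import com.google.common.io.ByteStreams;
import com.google.common.io.CountingInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class LengthCheckExample {
    // Returns true when the stream yields exactly expectedLength bytes.
    static boolean lengthMatches(InputStream is, long expectedLength) throws IOException {
        try (CountingInputStream cis = new CountingInputStream(is)) {
            // Drain into a null sink; only the byte count matters.
            ByteStreams.copy(cis, ByteStreams.nullOutputStream());
            return cis.getCount() == expectedLength;
        }
    }

    public static void main(String[] args) throws IOException {
        byte[] blob = new byte[1024];
        System.out.println(lengthMatches(new ByteArrayInputStream(blob), 1024));   // true
        System.out.println(lengthMatches(new ByteArrayInputStream(blob), 2048));   // false
    }
}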

Aggregations

CountingInputStream (com.google.common.io.CountingInputStream) 20
ByteArrayInputStream (java.io.ByteArrayInputStream) 12
DataInputStream (java.io.DataInputStream) 12
CountingOutputStream (com.google.common.io.CountingOutputStream) 11
ByteArrayOutputStream (java.io.ByteArrayOutputStream) 11
DataOutputStream (java.io.DataOutputStream) 11
Test (org.junit.Test) 11
KeyValue (org.apache.hadoop.hbase.KeyValue) 8
Cell (org.apache.hadoop.hbase.Cell) 4
IOException (java.io.IOException) 2
InputStream (java.io.InputStream) 2
ArrayBackedTag (org.apache.hadoop.hbase.ArrayBackedTag) 2
Tag (org.apache.hadoop.hbase.Tag) 2
LazyInputStream (org.apache.jackrabbit.oak.commons.io.LazyInputStream) 2
ParseContext (org.apache.tika.parser.ParseContext) 2
WriteOutContentHandler (org.apache.tika.sax.WriteOutContentHandler) 2
VisibleForTesting (com.google.common.annotations.VisibleForTesting) 1
Stopwatch (com.google.common.base.Stopwatch) 1
FileNotFoundException (java.io.FileNotFoundException) 1
FilterInputStream (java.io.FilterInputStream) 1