Search in sources :

Example 1 with ChmBlockInfo

use of org.apache.tika.parser.chm.lzx.ChmBlockInfo in project tika by apache.

the class ChmExtractor method extractChmEntry.

/**
 * Decompresses a chm entry.
 * <p>
 * Uncompressed entries with a positive length are copied straight out of the
 * raw chm data. Compressed entries are decoded LZX block by LZX block,
 * starting from the closest block already present in the LZX block cache
 * (or from the entry's initial block when the cache holds nothing usable).
 * Entries for which {@code ChmCommons.hasSkip} returns {@code true}, and
 * entries matching neither case, yield an empty array.
 *
 * @param directoryListingEntry the directory listing entry to extract
 *
 * @return decompressed data
 * @throws TikaException if the number of extracted bytes differs from the
 *         entry length, or if any other failure occurs during extraction
 *         (the original exception is attached as the cause)
 */
public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    ChmLzxBlock lzxBlock = null;
    try {
        /* UNCOMPRESSED type is easiest one */
        if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED && directoryListingEntry.getLength() > 0 && !ChmCommons.hasSkip(directoryListingEntry)) {
            int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry.getOffset());
            buffer.write(ChmCommons.copyOfRange(getData(), dataOffset, dataOffset + directoryListingEntry.getLength()));
        } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED && !ChmCommons.hasSkip(directoryListingEntry)) {
            /* Gets the block range and offsets covering this entry */
            ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(directoryListingEntry, (int) getChmLzxcResetTable().getBlockLen(), getChmLzxcControlData());
            if ((getLzxBlockLength() < Integer.MAX_VALUE) && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
                // TODO: Improve the caching
                // Look for the cached block with the highest block number inside
                // [iniBlock, startBlock]; decoding can resume from there.
                int start = -1;
                int hitCache = 0;
                for (int c = 0; c < getLzxBlocksCache().size(); c++) {
                    int bn = getLzxBlocksCache().get(c).getBlockNumber();
                    if (bn >= bb.getIniBlock() && bn <= bb.getStartBlock() && bn > start) {
                        start = bn;
                        hitCache = c;
                    }
                    // Best possible hit: the start block itself is cached.
                    if (start == bb.getStartBlock()) {
                        break;
                    }
                }
                if (start < 0) {
                    // Nothing usable in the cache: decode from the initial block.
                    start = bb.getIniBlock();
                    byte[] dataSegment = ChmCommons.getChmBlockSegment(getData(), getChmLzxcResetTable(), start, (int) getLzxBlockOffset(), (int) getLzxBlockLength());
                    lzxBlock = new ChmLzxBlock(start, dataSegment, getChmLzxcResetTable().getBlockLen(), null);
                    getLzxBlocksCache().add(lzxBlock);
                } else {
                    lzxBlock = getLzxBlocksCache().get(hitCache);
                }
                for (int i = start; i <= bb.getEndBlock(); ) {
                    if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
                        // Entry lies entirely within a single block.
                        buffer.write(lzxBlock.getContent(bb.getStartOffset(), bb.getEndOffset()));
                        break;
                    }
                    if (i == bb.getStartBlock()) {
                        buffer.write(lzxBlock.getContent(bb.getStartOffset()));
                    }
                    if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
                        buffer.write(lzxBlock.getContent());
                    }
                    if (i == bb.getEndBlock()) {
                        buffer.write(lzxBlock.getContent(0, bb.getEndOffset()));
                        break;
                    }
                    i++;
                    // On a reset-interval boundary the next block is created without
                    // a previous block; otherwise the previous block is handed over.
                    if (i % getChmLzxcControlData().getResetInterval() == 0) {
                        lzxBlock = new ChmLzxBlock(i, ChmCommons.getChmBlockSegment(getData(), getChmLzxcResetTable(), i, (int) getLzxBlockOffset(), (int) getLzxBlockLength()), getChmLzxcResetTable().getBlockLen(), null);
                    } else {
                        lzxBlock = new ChmLzxBlock(i, ChmCommons.getChmBlockSegment(getData(), getChmLzxcResetTable(), i, (int) getLzxBlockOffset(), (int) getLzxBlockLength()), getChmLzxcResetTable().getBlockLen(), lzxBlock);
                    }
                    getLzxBlocksCache().add(lzxBlock);
                }
                // Drop the whole cache once it holds more entries than the reset table's block count.
                if (getLzxBlocksCache().size() > getChmLzxcResetTable().getBlockCount()) {
                    getLzxBlocksCache().clear();
                }
            }
            if (buffer.size() != directoryListingEntry.getLength()) {
                throw new TikaException("CHM file extract error: extracted Length is wrong.");
            }
        }
    } catch (Exception e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new TikaException(e.getMessage(), e);
    }
    return buffer.toByteArray();
}
Also used : TikaException(org.apache.tika.exception.TikaException) ChmLzxBlock(org.apache.tika.parser.chm.lzx.ChmLzxBlock) ChmBlockInfo(org.apache.tika.parser.chm.lzx.ChmBlockInfo) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException)

Aggregations

ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 TikaException (org.apache.tika.exception.TikaException)1 ChmBlockInfo (org.apache.tika.parser.chm.lzx.ChmBlockInfo)1 ChmLzxBlock (org.apache.tika.parser.chm.lzx.ChmLzxBlock)1