Use of org.apache.tika.parser.chm.lzx.ChmBlockInfo in project tika by apache.
The method extractChmEntry of the class ChmExtractor.
/**
 * Decompresses a chm entry.
 * <p>
 * An uncompressed entry is copied out of the raw data, starting at the ITSF
 * data offset plus the entry's own offset. A compressed entry is rebuilt by
 * walking LZX blocks from the best cached block (the cached block with the
 * highest block number between the initial block and the start block of the
 * entry) or, when no such block is cached, from the initial block, up to the
 * entry's end block. Every block created along the way is added to the cache.
 * <p>
 * An entry for which {@code ChmCommons.hasSkip} returns true, or an
 * uncompressed entry whose length is not positive, produces an empty array.
 *
 * @param directoryListingEntry the directory listing entry to extract
 *
 * @return decompressed data
 * @throws TikaException if anything fails during extraction (the original
 *         exception is attached as the cause), or if the number of bytes
 *         extracted for a compressed entry differs from the entry's length
 */
public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    ChmLzxBlock lzxBlock = null;
    try {
        /* UNCOMPRESSED type is easiest one */
        if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED && directoryListingEntry.getLength() > 0 && !ChmCommons.hasSkip(directoryListingEntry)) {
            int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry.getOffset());
            buffer.write(ChmCommons.copyOfRange(getData(), dataOffset, dataOffset + directoryListingEntry.getLength()));
        } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED && !ChmCommons.hasSkip(directoryListingEntry)) {
            /* Gets a chm hit_cache info */
            ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(directoryListingEntry, (int) getChmLzxcResetTable().getBlockLen(), getChmLzxcControlData());
            int i = 0, start = 0, hitCache = 0;
            // The offset and length are narrowed to int below, so only proceed when they fit.
            if ((getLzxBlockLength() < Integer.MAX_VALUE) && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
                // TODO: Improve the caching
                // start stays -1 until a usable cached block is found.
                start = -1;
                if (!getLzxBlocksCache().isEmpty()) {
                    for (i = 0; i < getLzxBlocksCache().size(); i++) {
                        int bn = getLzxBlocksCache().get(i).getBlockNumber();
                        // Keep the cached block with the highest number inside
                        // [iniBlock, startBlock]; a direct range check replaces
                        // the former inner loop over every block number.
                        if (bn >= bb.getIniBlock() && bn <= bb.getStartBlock() && bn > start) {
                            start = bn;
                            hitCache = i;
                        }
                        // Cannot do better than the start block itself.
                        if (start == bb.getStartBlock()) {
                            break;
                        }
                    }
                }
                if (start < 0) {
                    // Nothing usable in the cache: build the initial block and cache it.
                    start = bb.getIniBlock();
                    byte[] dataSegment = ChmCommons.getChmBlockSegment(getData(), getChmLzxcResetTable(), start, (int) getLzxBlockOffset(), (int) getLzxBlockLength());
                    lzxBlock = new ChmLzxBlock(start, dataSegment, getChmLzxcResetTable().getBlockLen(), null);
                    getLzxBlocksCache().add(lzxBlock);
                } else {
                    lzxBlock = getLzxBlocksCache().get(hitCache);
                }
                // The loop index is advanced inside the body, after the current block was consumed.
                for (i = start; i <= bb.getEndBlock(); ) {
                    // Start block and end block are the same block.
                    if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
                        buffer.write(lzxBlock.getContent(bb.getStartOffset(), bb.getEndOffset()));
                        break;
                    }
                    // First block of a multi-block entry.
                    if (i == bb.getStartBlock()) {
                        buffer.write(lzxBlock.getContent(bb.getStartOffset()));
                    }
                    // Block strictly between the start block and the end block.
                    if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
                        buffer.write(lzxBlock.getContent());
                    }
                    // Last block of a multi-block entry.
                    if (i == bb.getEndBlock()) {
                        buffer.write(lzxBlock.getContent(0, bb.getEndOffset()));
                        break;
                    }
                    i++;
                    // On a reset-interval boundary the next block is built with null
                    // as its last constructor argument; otherwise the current block is passed.
                    ChmLzxBlock previous = (i % getChmLzxcControlData().getResetInterval() == 0) ? null : lzxBlock;
                    byte[] segment = ChmCommons.getChmBlockSegment(getData(), getChmLzxcResetTable(), i, (int) getLzxBlockOffset(), (int) getLzxBlockLength());
                    lzxBlock = new ChmLzxBlock(i, segment, getChmLzxcResetTable().getBlockLen(), previous);
                    getLzxBlocksCache().add(lzxBlock);
                }
                // Drop the whole cache once it holds more blocks than the reset table's block count.
                if (getLzxBlocksCache().size() > getChmLzxcResetTable().getBlockCount()) {
                    getLzxBlocksCache().clear();
                }
            }
            if (buffer.size() != directoryListingEntry.getLength()) {
                throw new TikaException("CHM file extract error: extracted Length is wrong.");
            }
        }
        //end of if compressed
    } catch (Exception e) {
        // Keep the original exception as the cause so its type and stack trace are not lost.
        throw new TikaException(e.getMessage(), e);
    }
    return buffer.toByteArray();
}
Aggregations