Use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in the Apache Tika project.
Class TestChmBlockInfo, method setUp.
/**
 * Prepares the fixtures shared by the tests in this class from the sample CHM bytes in
 * {@code TestParameters.chmData}: the raw data, the directory listing set, the parsed
 * LZXC control data and the parsed LZXC reset table.
 *
 * @throws Exception if any of the parsing steps fails
 */
@Before
public void setUp() throws Exception {
    data = TestParameters.chmData;

    // ITSF header: parsed from the leading bytes of the sample file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(data, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP header: starts at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(data, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from the raw data and both headers.
    chmDirListCont = new ChmDirectoryListingSet(data, itsfHeader, itspHeader);

    // Control data: the position of the LZXC marker is used as the start of the chunk and
    // the control-data directory entry supplies its length.
    int controlEntryIndex = chmDirListCont.getControlDataIndex();
    int lzxcStart = ChmCommons.indexOfResetTableBlock(data, ChmConstants.LZXC.getBytes(UTF_8));
    byte[] controlChunk = null;
    if (lzxcStart > 0) {
        int controlLength = chmDirListCont.getDirectoryListingEntryList().get(controlEntryIndex).getLength();
        controlChunk = ChmCommons.copyOfRange(data, lzxcStart, lzxcStart + controlLength);
    }
    // NOTE(review): controlChunk stays null when the marker is not found at a positive index;
    // parse() is then handed null, exactly as before.
    chmLzxcControlData = new ChmLzxcControlData();
    chmLzxcControlData.parse(controlChunk, chmLzxcControlData);

    // Reset table: located through its directory entry, relative to the data offset.
    int resetEntryIndex = chmDirListCont.getResetTableIndex();
    int resetStart = (int) chmDirListCont.getDataOffset()
            + chmDirListCont.getDirectoryListingEntryList().get(resetEntryIndex).getOffset();
    int resetLength = chmDirListCont.getDirectoryListingEntryList().get(resetEntryIndex).getLength();
    byte[] resetChunk = ChmCommons.copyOfRange(data, resetStart, resetStart + resetLength);
    clrt = new ChmLzxcResetTable();
    clrt.parse(resetChunk, clrt);
}
Use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in the Apache Tika project.
Class TestChmExtraction, method testExtractChmEntry.
/**
 * Extracts every entry listed in the CHM read from {@code stream} and sanity-checks it.
 *
 * <p>Each entry name must pass {@code niceAscFileName} and must be unique ignoring case.
 * Entries whose lower-cased name ends in {@code .html}, {@code .htm}, {@code .hhk} or
 * {@code .hhc} must additionally pass the {@code findZero} check and contain an
 * {@code <html ... </html>} pair.
 *
 * @param stream the CHM content to extract from
 * @throws TikaException if extraction fails or any of the checks above is violated
 * @throws IOException   if reading the stream fails
 */
protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
    ChmExtractor extractor = new ChmExtractor(stream);
    ChmDirectoryListingSet listing = extractor.getChmDirList();

    final int patternFlags = Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL;
    final Pattern htmlEnvelope = Pattern.compile("\\Q<html\\E.+\\Q</html>\\E", patternFlags);

    Set<String> seenNames = new HashSet<String>();
    for (DirectoryListingEntry entry : listing.getDirectoryListingEntryList()) {
        byte[] content = extractor.extractChmEntry(entry);
        String entryName = entry.getName();

        // Entry names should be nice. Disable this check if the test CHM legitimately
        // contains unusual but valid entry names.
        if (!niceAscFileName(entryName)) {
            throw new TikaException("Warning: File name contains a non ascii char : " + entry.getName());
        }

        // Set.add reports false when the lower-cased name has been seen before.
        final String lowerName = entry.getName().toLowerCase(Locale.ROOT);
        if (!seenNames.add(lowerName)) {
            throw new TikaException("Duplicate File name detected : " + entry.getName());
        }

        boolean htmlLike = lowerName.endsWith(".html")
                || lowerName.endsWith(".htm")
                || lowerName.endsWith(".hhk")
                || lowerName.endsWith(".hhc");
        if (!htmlLike) {
            continue;
        }

        if (findZero(content)) {
            throw new TikaException("Xhtml/text file contains '\\0' : " + entry.getName());
        }

        // Validate that the decoded text contains an opening and a closing html tag.
        String markup = new String(content, ISO_8859_1);
        boolean hasHtmlPair = htmlEnvelope.matcher(markup).find();
        if (!hasHtmlPair) {
            System.err.println(lowerName + " is invalid.");
            System.err.println(markup);
            throw new TikaException("Invalid xhtml file : " + entry.getName());
        }
    }
}
Use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in the Apache Tika project.
Class TestChmLzxcControlData, method setUp.
/**
 * Parses the sample CHM bytes in {@code TestParameters.chmData} far enough to populate the
 * {@code chmLzxcControlData} fixture used by the tests in this class.
 *
 * @throws Exception if any of the parsing steps fails
 */
@Before
public void setUp() throws Exception {
    byte[] chmBytes = TestParameters.chmData;

    // ITSF header: parsed from the leading bytes of the sample file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(chmBytes, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP header: starts at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(chmBytes, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from the raw data and both headers.
    ChmDirectoryListingSet listing = new ChmDirectoryListingSet(chmBytes, itsfHeader, itspHeader);

    // Control data: the position of the LZXC marker is used as the start of the chunk and
    // the control-data directory entry supplies its length.
    int controlEntryIndex = listing.getControlDataIndex();
    int lzxcStart = ChmCommons.indexOfResetTableBlock(chmBytes, ChmConstants.LZXC.getBytes(UTF_8));
    byte[] controlChunk = null;
    if (lzxcStart > 0) {
        int controlLength = listing.getDirectoryListingEntryList().get(controlEntryIndex).getLength();
        controlChunk = ChmCommons.copyOfRange(chmBytes, lzxcStart, lzxcStart + controlLength);
    }

    // NOTE(review): controlChunk stays null when the marker is not found at a positive index;
    // parse() is then handed null, exactly as before.
    chmLzxcControlData = new ChmLzxcControlData();
    chmLzxcControlData.parse(controlChunk, chmLzxcControlData);
}
Use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in the Apache Tika project.
Class TestChmLzxState, method setUp.
/**
 * Parses the sample CHM bytes in {@code TestParameters.chmData} to obtain the LZXC control
 * data and stores its window size in the {@code windowSize} fixture.
 *
 * @throws Exception if any of the parsing steps fails
 */
@Before
public void setUp() throws Exception {
    byte[] chmBytes = TestParameters.chmData;

    // ITSF header: parsed from the leading bytes of the sample file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(chmBytes, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP header: starts at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(chmBytes, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from the raw data and both headers.
    ChmDirectoryListingSet listing = new ChmDirectoryListingSet(chmBytes, itsfHeader, itspHeader);

    // The control-data entry is looked up by name in the listing; the position of the LZXC
    // marker is used as the start of the chunk.
    int controlEntryIndex = ChmCommons.indexOf(listing.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
    int lzxcStart = ChmCommons.indexOfResetTableBlock(chmBytes, ChmConstants.LZXC.getBytes(UTF_8));
    byte[] controlChunk = null;
    if (lzxcStart > 0) {
        int controlLength = listing.getDirectoryListingEntryList().get(controlEntryIndex).getLength();
        controlChunk = ChmCommons.copyOfRange(chmBytes, lzxcStart, lzxcStart + controlLength);
    }

    // NOTE(review): controlChunk stays null when the marker is not found at a positive index;
    // parse() is then handed null, exactly as before.
    ChmLzxcControlData controlData = new ChmLzxcControlData();
    controlData.parse(controlChunk, controlData);
    windowSize = (int) controlData.getWindowSize();
}
Use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in the Apache Tika project.
Class TestChmLzxcResetTable, method setUp.
/**
 * Parses the sample CHM bytes in {@code TestParameters.chmData} and populates the
 * {@code chmLzxcResetTable} fixture from the reset-table entry of the directory listing.
 * The LZXC control data is parsed along the way but only kept locally.
 *
 * @throws Exception if any of the parsing steps or the index assertion fails
 */
@Before
public void setUp() throws Exception {
    byte[] chmBytes = TestParameters.chmData;

    // ITSF header: parsed from the leading bytes of the sample file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(chmBytes, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP header: starts at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(chmBytes, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from the raw data and both headers.
    ChmDirectoryListingSet listing = new ChmDirectoryListingSet(chmBytes, itsfHeader, itspHeader);

    // Control data: the position of the LZXC marker is used as the start of the chunk and
    // the control-data directory entry supplies its length.
    int controlEntryIndex = listing.getControlDataIndex();
    int lzxcStart = ChmCommons.indexOfResetTableBlock(chmBytes, ChmConstants.LZXC.getBytes(UTF_8));
    byte[] controlChunk = null;
    if (lzxcStart > 0) {
        int controlLength = listing.getDirectoryListingEntryList().get(controlEntryIndex).getLength();
        controlChunk = ChmCommons.copyOfRange(chmBytes, lzxcStart, lzxcStart + controlLength);
    }
    // NOTE(review): controlChunk stays null when the marker is not found at a positive index;
    // parse() is then handed null, exactly as before.
    ChmLzxcControlData controlData = new ChmLzxcControlData();
    controlData.parse(controlChunk, controlData);

    // Reset table: located through its directory entry, relative to the data offset.
    int resetEntryIndex = listing.getResetTableIndex();
    chmLzxcResetTable = new ChmLzxcResetTable();
    int resetStart = (int) listing.getDataOffset()
            + listing.getDirectoryListingEntryList().get(resetEntryIndex).getOffset();
    // Check the start index against the data length before copying.
    ChmAssert.assertCopyingDataIndex(resetStart, chmBytes.length);
    int resetLength = listing.getDirectoryListingEntryList().get(resetEntryIndex).getLength();
    byte[] resetChunk = ChmCommons.copyOfRange(chmBytes, resetStart, resetStart + resetLength);
    chmLzxcResetTable.parse(resetChunk, chmLzxcResetTable);
}
Aggregations