Search in sources:

Example 1 with ChmDirectoryListingSet

use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in project tika by apache.

the class TestChmBlockInfo method setUp.

/**
 * Builds the fixtures used by the tests in this class from the raw CHM bytes in
 * {@code TestParameters.chmData}: the ITSF and ITSP headers, the directory listing
 * set, the LZXC control data and the LZXC reset table.
 *
 * @throws Exception if any of the parsing steps fails
 */
@Before
public void setUp() throws Exception {
    data = TestParameters.chmData;

    // ITSF header, parsed from the leading bytes of the file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(data, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP block, read starting at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(data, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from both headers.
    chmDirListCont = new ChmDirectoryListingSet(data, itsfHeader, itspHeader);
    int controlEntryIndex = chmDirListCont.getControlDataIndex();
    int lzxcPosition = ChmCommons.indexOfResetTableBlock(data, ChmConstants.LZXC.getBytes(UTF_8));

    // The control block bytes are only copied when a positive LZXC position was
    // reported; otherwise null is handed to the parser below.
    byte[] controlBytes = (lzxcPosition > 0)
            ? ChmCommons.copyOfRange(
                    data,
                    lzxcPosition,
                    lzxcPosition + chmDirListCont.getDirectoryListingEntryList().get(controlEntryIndex).getLength())
            : null;

    // LZXC control data.
    chmLzxcControlData = new ChmLzxcControlData();
    chmLzxcControlData.parse(controlBytes, chmLzxcControlData);

    // LZXC reset table: its entry offset is relative to the data offset of the listing.
    int resetEntryIndex = chmDirListCont.getResetTableIndex();
    int dataOffset = (int) chmDirListCont.getDataOffset();
    int startIndex = dataOffset + chmDirListCont.getDirectoryListingEntryList().get(resetEntryIndex).getOffset();
    byte[] resetTableBytes = ChmCommons.copyOfRange(
            data,
            startIndex,
            startIndex + chmDirListCont.getDirectoryListingEntryList().get(resetEntryIndex).getLength());
    clrt = new ChmLzxcResetTable();
    clrt.parse(resetTableBytes, clrt);
}
Also used : ChmLzxcControlData(org.apache.tika.parser.chm.accessor.ChmLzxcControlData) ChmItsfHeader(org.apache.tika.parser.chm.accessor.ChmItsfHeader) ChmDirectoryListingSet(org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet) ChmLzxcResetTable(org.apache.tika.parser.chm.accessor.ChmLzxcResetTable) ChmItspHeader(org.apache.tika.parser.chm.accessor.ChmItspHeader) Before(org.junit.Before)

Example 2 with ChmDirectoryListingSet

use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in project tika by apache.

the class TestChmExtraction method testExtractChmEntry.

/**
 * Extracts every entry of the CHM read from {@code stream} and validates it.
 *
 * <p>For each directory listing entry the method checks that the name passes
 * {@code niceAscFileName}, that the lower-cased name has not been seen before, and,
 * for names ending in {@code .html}, {@code .htm}, {@code .hhk} or {@code .hhc},
 * that the content passes the {@code findZero} check and contains an
 * {@code <html ... </html>} pair (matched case-insensitively).
 *
 * @param stream the CHM content to extract from
 * @throws TikaException if any of the checks above fails for an entry
 * @throws IOException declared for the extraction calls
 */
protected void testExtractChmEntry(InputStream stream) throws TikaException, IOException {
    ChmExtractor extractor = new ChmExtractor(stream);
    ChmDirectoryListingSet listing = extractor.getChmDirList();
    final Pattern htmlPairP = Pattern.compile(
            "\\Q<html\\E.+\\Q</html>\\E",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE | Pattern.DOTALL);
    Set<String> seenNames = new HashSet<String>();

    for (DirectoryListingEntry entry : listing.getDirectoryListingEntryList()) {
        // Extraction happens first so that a failing extraction surfaces before any name check.
        byte[] content = extractor.extractChmEntry(entry);
        String entryName = entry.getName();

        // Entry names should be nice. Disable this if the test chm files do have
        // bad looking but valid entry names.
        if (!niceAscFileName(entryName)) {
            throw new TikaException("Warning: File name contains a non ascii char : " + entryName);
        }

        // Set.add returns false when the lower-cased name was already recorded.
        final String lowName = entryName.toLowerCase(Locale.ROOT);
        if (!seenNames.add(lowName)) {
            throw new TikaException("Duplicate File name detected : " + entryName);
        }

        boolean markupEntry = lowName.endsWith(".html")
                || lowName.endsWith(".htm")
                || lowName.endsWith(".hhk")
                || lowName.endsWith(".hhc");
        if (!markupEntry) {
            continue;
        }

        if (findZero(content)) {
            throw new TikaException("Xhtml/text file contains '\\0' : " + entryName);
        }

        // Validate html: the decoded text must contain an opening and a closing html tag.
        String html = new String(content, ISO_8859_1);
        if (!htmlPairP.matcher(html).find()) {
            System.err.println(lowName + " is invalid.");
            System.err.println(html);
            throw new TikaException("Invalid xhtml file : " + entryName);
        }
    }
}
Also used : Pattern(java.util.regex.Pattern) TikaException(org.apache.tika.exception.TikaException) ChmDirectoryListingSet(org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet) ChmExtractor(org.apache.tika.parser.chm.core.ChmExtractor) HashSet(java.util.HashSet) DirectoryListingEntry(org.apache.tika.parser.chm.accessor.DirectoryListingEntry)

Example 3 with ChmDirectoryListingSet

use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in project tika by apache.

the class TestChmLzxcControlData method setUp.

/**
 * Builds the {@code chmLzxcControlData} fixture from the raw CHM bytes in
 * {@code TestParameters.chmData}, parsing the ITSF header, the ITSP header and the
 * directory listing set on the way.
 *
 * @throws Exception if any of the parsing steps fails
 */
@Before
public void setUp() throws Exception {
    byte[] chmBytes = TestParameters.chmData;

    // ITSF header, parsed from the leading bytes of the file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(chmBytes, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP block, read starting at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(chmBytes, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from both headers.
    ChmDirectoryListingSet listing = new ChmDirectoryListingSet(chmBytes, itsfHeader, itspHeader);
    int controlEntryIndex = listing.getControlDataIndex();
    int lzxcPosition = ChmCommons.indexOfResetTableBlock(chmBytes, ChmConstants.LZXC.getBytes(UTF_8));

    // The control block bytes are only copied when a positive LZXC position was
    // reported; otherwise null is handed to the parser below.
    byte[] controlBytes = (lzxcPosition > 0)
            ? ChmCommons.copyOfRange(
                    chmBytes,
                    lzxcPosition,
                    lzxcPosition + listing.getDirectoryListingEntryList().get(controlEntryIndex).getLength())
            : null;

    // LZXC control data under test.
    chmLzxcControlData = new ChmLzxcControlData();
    chmLzxcControlData.parse(controlBytes, chmLzxcControlData);
}
Also used : ChmLzxcControlData(org.apache.tika.parser.chm.accessor.ChmLzxcControlData) ChmItsfHeader(org.apache.tika.parser.chm.accessor.ChmItsfHeader) ChmDirectoryListingSet(org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet) ChmItspHeader(org.apache.tika.parser.chm.accessor.ChmItspHeader) Before(org.junit.Before)

Example 4 with ChmDirectoryListingSet

use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in project tika by apache.

the class TestChmLzxState method setUp.

/**
 * Derives the {@code windowSize} fixture from the raw CHM bytes in
 * {@code TestParameters.chmData}: parses the ITSF and ITSP headers, builds the
 * directory listing set, parses the LZXC control data and stores its window size.
 *
 * @throws Exception if any of the parsing steps fails
 */
@Before
public void setUp() throws Exception {
    byte[] chmBytes = TestParameters.chmData;

    // ITSF header, parsed from the leading bytes of the file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(chmBytes, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP block, read starting at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(chmBytes, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from both headers; the control data entry is looked up by name.
    ChmDirectoryListingSet listing = new ChmDirectoryListingSet(chmBytes, itsfHeader, itspHeader);
    int controlEntryIndex = ChmCommons.indexOf(listing.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
    int lzxcPosition = ChmCommons.indexOfResetTableBlock(chmBytes, ChmConstants.LZXC.getBytes(UTF_8));

    // The control block bytes are only copied when a positive LZXC position was
    // reported; otherwise null is handed to the parser below.
    byte[] controlBytes = (lzxcPosition > 0)
            ? ChmCommons.copyOfRange(
                    chmBytes,
                    lzxcPosition,
                    lzxcPosition + listing.getDirectoryListingEntryList().get(controlEntryIndex).getLength())
            : null;

    ChmLzxcControlData controlData = new ChmLzxcControlData();
    controlData.parse(controlBytes, controlData);
    windowSize = (int) controlData.getWindowSize();
}
Also used : ChmLzxcControlData(org.apache.tika.parser.chm.accessor.ChmLzxcControlData) ChmItsfHeader(org.apache.tika.parser.chm.accessor.ChmItsfHeader) ChmDirectoryListingSet(org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet) ChmItspHeader(org.apache.tika.parser.chm.accessor.ChmItspHeader) Before(org.junit.Before)

Example 5 with ChmDirectoryListingSet

use of org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet in project tika by apache.

the class TestChmLzxcResetTable method setUp.

/**
 * Builds the {@code chmLzxcResetTable} fixture from the raw CHM bytes in
 * {@code TestParameters.chmData}: parses the ITSF and ITSP headers, builds the
 * directory listing set, parses the LZXC control data and finally parses the
 * reset table from the bytes of its directory listing entry.
 *
 * @throws Exception if any of the parsing steps fails
 */
@Before
public void setUp() throws Exception {
    byte[] chmBytes = TestParameters.chmData;

    // ITSF header, parsed from the leading bytes of the file.
    ChmItsfHeader itsfHeader = new ChmItsfHeader();
    byte[] itsfBytes = ChmCommons.copyOfRange(chmBytes, 0, ChmConstants.CHM_ITSF_V3_LEN - 1);
    itsfHeader.parse(itsfBytes, itsfHeader);

    // ITSP block, read starting at the directory offset reported by the ITSF header.
    ChmItspHeader itspHeader = new ChmItspHeader();
    int dirOffset = (int) itsfHeader.getDirOffset();
    byte[] itspBytes = ChmCommons.copyOfRange(chmBytes, dirOffset, dirOffset + ChmConstants.CHM_ITSP_V1_LEN);
    itspHeader.parse(itspBytes, itspHeader);

    // Directory listing built from both headers.
    ChmDirectoryListingSet listing = new ChmDirectoryListingSet(chmBytes, itsfHeader, itspHeader);
    int controlEntryIndex = listing.getControlDataIndex();
    int lzxcPosition = ChmCommons.indexOfResetTableBlock(chmBytes, ChmConstants.LZXC.getBytes(UTF_8));

    // The control block bytes are only copied when a positive LZXC position was
    // reported; otherwise null is handed to the parser below.
    byte[] controlBytes = (lzxcPosition > 0)
            ? ChmCommons.copyOfRange(
                    chmBytes,
                    lzxcPosition,
                    lzxcPosition + listing.getDirectoryListingEntryList().get(controlEntryIndex).getLength())
            : null;

    // LZXC control data is parsed before the reset table is located.
    ChmLzxcControlData controlData = new ChmLzxcControlData();
    controlData.parse(controlBytes, controlData);

    // Reset table: its entry offset is relative to the data offset of the listing.
    int resetEntryIndex = listing.getResetTableIndex();
    chmLzxcResetTable = new ChmLzxcResetTable();
    int dataOffset = (int) listing.getDataOffset();
    int startIndex = dataOffset + listing.getDirectoryListingEntryList().get(resetEntryIndex).getOffset();
    // Check the start index against the data length before copying.
    ChmAssert.assertCopyingDataIndex(startIndex, chmBytes.length);
    byte[] resetTableBytes = ChmCommons.copyOfRange(
            chmBytes,
            startIndex,
            startIndex + listing.getDirectoryListingEntryList().get(resetEntryIndex).getLength());
    chmLzxcResetTable.parse(resetTableBytes, chmLzxcResetTable);
}
Also used : ChmLzxcControlData(org.apache.tika.parser.chm.accessor.ChmLzxcControlData) ChmItsfHeader(org.apache.tika.parser.chm.accessor.ChmItsfHeader) ChmDirectoryListingSet(org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet) ChmLzxcResetTable(org.apache.tika.parser.chm.accessor.ChmLzxcResetTable) ChmItspHeader(org.apache.tika.parser.chm.accessor.ChmItspHeader) Before(org.junit.Before)

Aggregations

ChmDirectoryListingSet (org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet): 6, ChmItsfHeader (org.apache.tika.parser.chm.accessor.ChmItsfHeader): 4, ChmItspHeader (org.apache.tika.parser.chm.accessor.ChmItspHeader): 4, ChmLzxcControlData (org.apache.tika.parser.chm.accessor.ChmLzxcControlData): 4, Before (org.junit.Before): 4, ChmLzxcResetTable (org.apache.tika.parser.chm.accessor.ChmLzxcResetTable): 2, DirectoryListingEntry (org.apache.tika.parser.chm.accessor.DirectoryListingEntry): 2, HashSet (java.util.HashSet): 1, Pattern (java.util.regex.Pattern): 1, TikaTest (org.apache.tika.TikaTest): 1, TikaException (org.apache.tika.exception.TikaException): 1, ChmExtractor (org.apache.tika.parser.chm.core.ChmExtractor): 1, Test (org.junit.Test): 1