
Example 1 with CarbonIndexFileReader

Use of org.apache.carbondata.core.reader.CarbonIndexFileReader in project carbondata by apache.

From class StreamSegment, method size.

/**
 * Calculates the size of the segment by accumulating the data file sizes recorded in the index file.
 */
public static long size(String segmentDir) throws IOException {
    long size = 0;
    FileFactory.FileType fileType = FileFactory.getFileType(segmentDir);
    if (FileFactory.isFileExist(segmentDir, fileType)) {
        String indexPath = CarbonTablePath.getCarbonStreamIndexFilePath(segmentDir);
        CarbonFile index = FileFactory.getCarbonFile(indexPath, fileType);
        if (index.exists()) {
            CarbonIndexFileReader indexReader = new CarbonIndexFileReader();
            try {
                indexReader.openThriftReader(indexPath);
                while (indexReader.hasNext()) {
                    BlockIndex blockIndex = indexReader.readBlockIndexInfo();
                    size += blockIndex.getFile_size();
                }
            } finally {
                indexReader.closeThriftReader();
            }
        }
    }
    return size;
}
Also used: CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile), CarbonIndexFileReader (org.apache.carbondata.core.reader.CarbonIndexFileReader), BlockIndex (org.apache.carbondata.format.BlockIndex), FileFactory (org.apache.carbondata.core.datastore.impl.FileFactory)
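
Every example on this page follows the same open/iterate/close idiom on CarbonIndexFileReader. A minimal sketch of that idiom in isolation, using the same imports as the example above; the helper name countBlocks is hypothetical:

/**
 * Counts the block entries in a .carbonindex file.
 * Hypothetical helper; a minimal sketch of the reader idiom shown above.
 */
public static int countBlocks(String indexFilePath) throws IOException {
    CarbonIndexFileReader indexReader = new CarbonIndexFileReader();
    int count = 0;
    try {
        indexReader.openThriftReader(indexFilePath);
        while (indexReader.hasNext()) {
            // each entry describes one data file of the segment
            BlockIndex blockIndex = indexReader.readBlockIndexInfo();
            count++;
        }
    } finally {
        // close the underlying thrift reader explicitly, as every example on this page does
        indexReader.closeThriftReader();
    }
    return count;
}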

Example 2 with CarbonIndexFileReader

Use of org.apache.carbondata.core.reader.CarbonIndexFileReader in project carbondata by apache.

From class StreamSegment, method recoverSegmentIfRequired.

/**
 * Checks the health of a stream segment and tries to recover it from a job fault.
 * This method is invoked in the following scenarios:
 * 1. at the beginning of streaming (StreamSinkFactory.getStreamSegmentId)
 * 2. after a job has failed (CarbonAppendableStreamSink.writeDataFileJob)
 */
public static void recoverSegmentIfRequired(String segmentDir) throws IOException {
    FileFactory.FileType fileType = FileFactory.getFileType(segmentDir);
    if (FileFactory.isFileExist(segmentDir, fileType)) {
        String indexName = CarbonTablePath.getCarbonStreamIndexFileName();
        String indexPath = segmentDir + File.separator + indexName;
        CarbonFile index = FileFactory.getCarbonFile(indexPath, fileType);
        CarbonFile[] files = listDataFiles(segmentDir, fileType);
        // index file exists
        if (index.exists()) {
            // data file exists
            if (files.length > 0) {
                CarbonIndexFileReader indexReader = new CarbonIndexFileReader();
                try {
                    // map block index
                    indexReader.openThriftReader(indexPath);
                    Map<String, Long> tableSizeMap = new HashMap<>();
                    while (indexReader.hasNext()) {
                        BlockIndex blockIndex = indexReader.readBlockIndexInfo();
                        tableSizeMap.put(blockIndex.getFile_name(), blockIndex.getFile_size());
                    }
                    // recover each file
                    for (CarbonFile file : files) {
                        Long size = tableSizeMap.get(file.getName());
                        if (null == size || size == 0) {
                            file.delete();
                        } else if (size < file.getSize()) {
                            FileFactory.truncateFile(file.getCanonicalPath(), fileType, size);
                        }
                    }
                } finally {
                    indexReader.closeThriftReader();
                }
            }
        } else {
            if (files.length > 0) {
                for (CarbonFile file : files) {
                    file.delete();
                }
            }
        }
    }
}
Also used: CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile), HashMap (java.util.HashMap), CarbonIndexFileReader (org.apache.carbondata.core.reader.CarbonIndexFileReader), BlockIndex (org.apache.carbondata.format.BlockIndex), FileFactory (org.apache.carbondata.core.datastore.impl.FileFactory)
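
The per-file decision inside the recovery loop (delete when the index records no size, truncate when the file on disk is longer than the recorded size, otherwise keep) can be isolated for testing. A minimal sketch of that decision under this reading; the enum and method names are hypothetical:

/** Recovery action for one data file; hypothetical names. */
enum RecoveryAction { DELETE, TRUNCATE, KEEP }

static RecoveryAction decide(Long recordedSize, long actualSize) {
    if (recordedSize == null || recordedSize == 0) {
        // the index holds no committed data for this file
        return RecoveryAction.DELETE;
    }
    if (recordedSize < actualSize) {
        // bytes past the recorded size were never committed to the index
        return RecoveryAction.TRUNCATE;
    }
    return RecoveryAction.KEEP;
}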

Example 3 with CarbonIndexFileReader

Use of org.apache.carbondata.core.reader.CarbonIndexFileReader in project carbondata by apache.

From class StreamSegment, method recoverFileIfRequired.

/**
 * Checks the health of a stream data file and tries to recover it from a task fault.
 * This method is invoked in the following scenario:
 * 1. at the beginning of a data file writing task
 */
public static void recoverFileIfRequired(String segmentDir, String fileName, String indexName) throws IOException {
    FileFactory.FileType fileType = FileFactory.getFileType(segmentDir);
    String filePath = segmentDir + File.separator + fileName;
    CarbonFile file = FileFactory.getCarbonFile(filePath, fileType);
    String indexPath = segmentDir + File.separator + indexName;
    CarbonFile index = FileFactory.getCarbonFile(indexPath, fileType);
    if (file.exists() && index.exists()) {
        CarbonIndexFileReader indexReader = new CarbonIndexFileReader();
        try {
            indexReader.openThriftReader(indexPath);
            while (indexReader.hasNext()) {
                BlockIndex blockIndex = indexReader.readBlockIndexInfo();
                if (blockIndex.getFile_name().equals(fileName)) {
                    if (blockIndex.getFile_size() == 0) {
                        file.delete();
                    } else if (blockIndex.getFile_size() < file.getSize()) {
                        FileFactory.truncateFile(filePath, fileType, blockIndex.getFile_size());
                    }
                }
            }
        } finally {
            indexReader.closeThriftReader();
        }
    }
}
Also used: CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile), CarbonIndexFileReader (org.apache.carbondata.core.reader.CarbonIndexFileReader), BlockIndex (org.apache.carbondata.format.BlockIndex), FileFactory (org.apache.carbondata.core.datastore.impl.FileFactory)
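
All three StreamSegment methods repeat the same try/finally plumbing around the reader. A caller-side wrapper could centralize it; a sketch assuming Java 8 and hypothetical names:

/** Opens an index reader, runs the action, and always closes the reader. Hypothetical helper. */
static <T> T withIndexReader(String indexPath, IndexReaderAction<T> action) throws IOException {
    CarbonIndexFileReader indexReader = new CarbonIndexFileReader();
    try {
        indexReader.openThriftReader(indexPath);
        return action.apply(indexReader);
    } finally {
        indexReader.closeThriftReader();
    }
}

/** Custom functional interface so the action may throw IOException. */
interface IndexReaderAction<T> {
    T apply(CarbonIndexFileReader reader) throws IOException;
}

With it, the size computation of Example 1 reduces to a single lambda that loops over readBlockIndexInfo() and sums getFile_size().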

Example 4 with CarbonIndexFileReader

Use of org.apache.carbondata.core.reader.CarbonIndexFileReader in project carbondata by apache.

From class CarbonTableInputFormat, method getSplitsOfStreaming.

/**
 * Uses the file list in the .carbonindex file to compute the input splits of the streaming segments.
 */
public List<InputSplit> getSplitsOfStreaming(JobContext job, AbsoluteTableIdentifier identifier, List<Segment> streamSegments) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (streamSegments != null && !streamSegments.isEmpty()) {
        numStreamSegments = streamSegments.size();
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        for (Segment segment : streamSegments) {
            String segmentDir = CarbonTablePath.getSegmentPath(identifier.getTablePath(), segment.getSegmentNo());
            FileFactory.FileType fileType = FileFactory.getFileType(segmentDir);
            if (FileFactory.isFileExist(segmentDir, fileType)) {
                String indexName = CarbonTablePath.getCarbonStreamIndexFileName();
                String indexPath = segmentDir + File.separator + indexName;
                CarbonFile index = FileFactory.getCarbonFile(indexPath, fileType);
                // index file exists
                if (index.exists()) {
                    CarbonIndexFileReader indexReader = new CarbonIndexFileReader();
                    try {
                        // map block index
                        indexReader.openThriftReader(indexPath);
                        while (indexReader.hasNext()) {
                            BlockIndex blockIndex = indexReader.readBlockIndexInfo();
                            String filePath = segmentDir + File.separator + blockIndex.getFile_name();
                            Path path = new Path(filePath);
                            long length = blockIndex.getFile_size();
                            if (length != 0) {
                                BlockLocation[] blkLocations;
                                FileSystem fs = FileFactory.getFileSystem(path);
                                FileStatus file = fs.getFileStatus(path);
                                blkLocations = fs.getFileBlockLocations(path, 0, length);
                                long blockSize = file.getBlockSize();
                                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                                long bytesRemaining = length;
                                while (((double) bytesRemaining) / splitSize > 1.1) {
                                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                                    splits.add(makeSplit(segment.getSegmentNo(), path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts(), FileFormat.ROW_V1));
                                    bytesRemaining -= splitSize;
                                }
                                if (bytesRemaining != 0) {
                                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                                    splits.add(makeSplit(segment.getSegmentNo(), path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts(), FileFormat.ROW_V1));
                                }
                            } else {
                                // Create empty hosts array for zero length files
                                splits.add(makeSplit(segment.getSegmentNo(), path, 0, length, new String[0], FileFormat.ROW_V1));
                            }
                        }
                    } finally {
                        indexReader.closeThriftReader();
                    }
                }
            }
        }
    }
    return splits;
}
Also used: Path (org.apache.hadoop.fs.Path), CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath), CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile), FileStatus (org.apache.hadoop.fs.FileStatus), CarbonIndexFileReader (org.apache.carbondata.core.reader.CarbonIndexFileReader), ArrayList (java.util.ArrayList), BlockLocation (org.apache.hadoop.fs.BlockLocation), BlockIndex (org.apache.carbondata.format.BlockIndex), Segment (org.apache.carbondata.core.datamap.Segment), FileFactory (org.apache.carbondata.core.datastore.impl.FileFactory), FileSystem (org.apache.hadoop.fs.FileSystem), InputSplit (org.apache.hadoop.mapreduce.InputSplit), CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit)
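
The loop above relies on the split arithmetic CarbonTableInputFormat inherits from Hadoop's FileInputFormat: computeSplitSize clamps the HDFS block size between the configured minimum and maximum split sizes, and the 1.1 factor lets the last split run up to 10% over rather than emitting a tiny remainder. A worked sketch with illustrative numbers:

// computeSplitSize, as inherited from FileInputFormat:
//   Math.max(minSize, Math.min(maxSize, blockSize))
long blockSize = 128L << 20;                // 128 MB HDFS block (illustrative)
long minSize = 1L;
long maxSize = Long.MAX_VALUE;
long splitSize = Math.max(minSize, Math.min(maxSize, blockSize));   // 128 MB

long length = 300L << 20;                   // a 300 MB data file (illustrative)
long bytesRemaining = length;
while (((double) bytesRemaining) / splitSize > 1.1) {
    // a split of splitSize bytes starts at offset (length - bytesRemaining)
    bytesRemaining -= splitSize;
}
// two 128 MB splits are emitted; the loop stops with 44 MB remaining
// (44.0 / 128 = 0.34 <= 1.1), which becomes the final, smaller split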

Example 5 with CarbonIndexFileReader

Use of org.apache.carbondata.core.reader.CarbonIndexFileReader in project carbondata by apache.

From class AbstractDataFileFooterConverter, method getIndexInfo.

/**
 * Reads the index info (one DataFileFooter per block) from an index file.
 *
 * @param filePath file path of the index file
 * @param fileData optional in-memory content of the index file; read instead of filePath when non-null
 * @return list of index info
 * @throws IOException if the index file cannot be read
 */
public List<DataFileFooter> getIndexInfo(String filePath, byte[] fileData) throws IOException {
    CarbonIndexFileReader indexReader = new CarbonIndexFileReader();
    List<DataFileFooter> dataFileFooters = new ArrayList<DataFileFooter>();
    String parentPath = filePath.substring(0, filePath.lastIndexOf("/"));
    try {
        // open the reader
        if (fileData != null) {
            indexReader.openThriftReader(fileData);
        } else {
            indexReader.openThriftReader(filePath);
        }
        // get the index header
        org.apache.carbondata.format.IndexHeader readIndexHeader = indexReader.readIndexHeader();
        List<ColumnSchema> columnSchemaList = new ArrayList<ColumnSchema>();
        List<org.apache.carbondata.format.ColumnSchema> table_columns = readIndexHeader.getTable_columns();
        for (int i = 0; i < table_columns.size(); i++) {
            columnSchemaList.add(thriftColumnSchmeaToWrapperColumnSchema(table_columns.get(i)));
        }
        // get the segment info
        SegmentInfo segmentInfo = getSegmentInfo(readIndexHeader.getSegment_info());
        BlockletIndex blockletIndex = null;
        DataFileFooter dataFileFooter = null;
        // read the block info from file
        while (indexReader.hasNext()) {
            BlockIndex readBlockIndexInfo = indexReader.readBlockIndexInfo();
            blockletIndex = getBlockletIndex(readBlockIndexInfo.getBlock_index());
            dataFileFooter = new DataFileFooter();
            TableBlockInfo tableBlockInfo = getTableBlockInfo(readBlockIndexInfo, readIndexHeader, parentPath);
            dataFileFooter.setBlockletIndex(blockletIndex);
            dataFileFooter.setColumnInTable(columnSchemaList);
            dataFileFooter.setNumberOfRows(readBlockIndexInfo.getNum_rows());
            dataFileFooter.setBlockInfo(new BlockInfo(tableBlockInfo));
            dataFileFooter.setSegmentInfo(segmentInfo);
            dataFileFooter.setVersionId(tableBlockInfo.getVersion());
            // In case of an old schema, the timestamp will not be found in the index header
            if (readIndexHeader.isSetSchema_time_stamp()) {
                dataFileFooter.setSchemaUpdatedTimeStamp(readIndexHeader.getSchema_time_stamp());
            }
            if (readBlockIndexInfo.isSetBlocklet_info()) {
                List<BlockletInfo> blockletInfoList = new ArrayList<BlockletInfo>();
                BlockletInfo blockletInfo = new DataFileFooterConverterV3().getBlockletInfo(readBlockIndexInfo.getBlocklet_info(), CarbonUtil.getNumberOfDimensionColumns(columnSchemaList));
                blockletInfo.setBlockletIndex(blockletIndex);
                blockletInfoList.add(blockletInfo);
                dataFileFooter.setBlockletList(blockletInfoList);
            }
            dataFileFooters.add(dataFileFooter);
        }
    } finally {
        indexReader.closeThriftReader();
    }
    return dataFileFooters;
}
Also used: TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo), CarbonIndexFileReader (org.apache.carbondata.core.reader.CarbonIndexFileReader), BlockletIndex (org.apache.carbondata.core.metadata.blocklet.index.BlockletIndex), ArrayList (java.util.ArrayList), BlockletInfo (org.apache.carbondata.core.metadata.blocklet.BlockletInfo), ColumnSchema (org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema), BlockIndex (org.apache.carbondata.format.BlockIndex), DataFileFooter (org.apache.carbondata.core.metadata.blocklet.DataFileFooter), BlockInfo (org.apache.carbondata.core.datastore.block.BlockInfo), SegmentInfo (org.apache.carbondata.core.metadata.blocklet.SegmentInfo)
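
getIndexInfo is typically reached through a concrete subclass such as DataFileFooterConverter (listed in the aggregations below). A minimal usage sketch; the index file path is hypothetical, and passing null for fileData makes the method open the file from disk, per the branch at the top of its body:

DataFileFooterConverter converter = new DataFileFooterConverter();
// null fileData -> openThriftReader(filePath) reads the index file from disk
List<DataFileFooter> footers = converter.getIndexInfo(
    "/store/db/tbl/Fact/Part0/Segment_0/sample.carbonindex", null);
for (DataFileFooter footer : footers) {
    // each footer carries the row count, blocklet index, column schemas and segment info
    long rows = footer.getNumberOfRows();
}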

Aggregations

CarbonIndexFileReader (org.apache.carbondata.core.reader.CarbonIndexFileReader): 8 usages
BlockIndex (org.apache.carbondata.format.BlockIndex): 7 usages
ArrayList (java.util.ArrayList): 4 usages
CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile): 4 usages
FileFactory (org.apache.carbondata.core.datastore.impl.FileFactory): 4 usages
TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo): 3 usages
BlockInfo (org.apache.carbondata.core.datastore.block.BlockInfo): 2 usages
BlockletInfo (org.apache.carbondata.core.metadata.blocklet.BlockletInfo): 2 usages
DataFileFooter (org.apache.carbondata.core.metadata.blocklet.DataFileFooter): 2 usages
SegmentInfo (org.apache.carbondata.core.metadata.blocklet.SegmentInfo): 2 usages
BlockletIndex (org.apache.carbondata.core.metadata.blocklet.index.BlockletIndex): 2 usages
ColumnSchema (org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema): 2 usages
ByteBuffer (java.nio.ByteBuffer): 1 usage
HashMap (java.util.HashMap): 1 usage
MockUp (mockit.MockUp): 1 usage
Segment (org.apache.carbondata.core.datamap.Segment): 1 usage
DataFileFooterConverter (org.apache.carbondata.core.util.DataFileFooterConverter): 1 usage
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath): 1 usage
MergedBlockIndex (org.apache.carbondata.format.MergedBlockIndex): 1 usage
CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit): 1 usage