
Example 16 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonStreamRecordReader method initialize.

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    // input
    if (split instanceof CarbonInputSplit) {
        fileSplit = (CarbonInputSplit) split;
    } else if (split instanceof CarbonMultiBlockSplit) {
        fileSplit = ((CarbonMultiBlockSplit) split).getAllSplits().get(0);
    } else {
        fileSplit = (FileSplit) split;
    }
    // metadata
    hadoopConf = context.getConfiguration();
    if (model == null) {
        CarbonTableInputFormat format = new CarbonTableInputFormat<Object>();
        model = format.createQueryModel(split, context);
    }
    carbonTable = model.getTable();
    List<CarbonDimension> dimensions = carbonTable.getDimensionByTableName(carbonTable.getTableName());
    dimensionCount = dimensions.size();
    List<CarbonMeasure> measures = carbonTable.getMeasureByTableName(carbonTable.getTableName());
    measureCount = measures.size();
    List<CarbonColumn> carbonColumnList = carbonTable.getStreamStorageOrderColumn(carbonTable.getTableName());
    storageColumns = carbonColumnList.toArray(new CarbonColumn[carbonColumnList.size()]);
    isNoDictColumn = CarbonDataProcessorUtil.getNoDictionaryMapping(storageColumns);
    directDictionaryGenerators = new DirectDictionaryGenerator[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
        if (storageColumns[i].hasEncoding(Encoding.DIRECT_DICTIONARY)) {
            directDictionaryGenerators[i] = DirectDictionaryKeyGeneratorFactory.getDirectDictionaryGenerator(storageColumns[i].getDataType());
        }
    }
    measureDataTypes = new DataType[measureCount];
    for (int i = 0; i < measureCount; i++) {
        measureDataTypes[i] = storageColumns[dimensionCount + i].getDataType();
    }
    // decode data
    allNonNull = new BitSet(storageColumns.length);
    projection = model.getProjectionColumns();
    isRequired = new boolean[storageColumns.length];
    boolean[] isFiltlerDimensions = model.getIsFilterDimensions();
    boolean[] isFiltlerMeasures = model.getIsFilterMeasures();
    isFilterRequired = new boolean[storageColumns.length];
    filterMap = new int[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
        if (storageColumns[i].isDimension()) {
            if (isFiltlerDimensions[storageColumns[i].getOrdinal()]) {
                isRequired[i] = true;
                isFilterRequired[i] = true;
                filterMap[i] = storageColumns[i].getOrdinal();
            }
        } else {
            if (isFiltlerMeasures[storageColumns[i].getOrdinal()]) {
                isRequired[i] = true;
                isFilterRequired[i] = true;
                filterMap[i] = carbonTable.getDimensionOrdinalMax() + storageColumns[i].getOrdinal();
            }
        }
    }
    isProjectionRequired = new boolean[storageColumns.length];
    projectionMap = new int[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
        for (int j = 0; j < projection.length; j++) {
            if (storageColumns[i].getColName().equals(projection[j].getColName())) {
                isRequired[i] = true;
                isProjectionRequired[i] = true;
                projectionMap[i] = j;
                break;
            }
        }
    }
    // initialize filter
    if (null != model.getFilterExpressionResolverTree()) {
        initializeFilter();
    } else if (projection.length == 0) {
        skipScanData = true;
    }
}
Also used : CarbonColumn(org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn) BitSet(java.util.BitSet) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) CarbonDimension(org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension) CarbonMeasure(org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure) CarbonMultiBlockSplit(org.apache.carbondata.hadoop.CarbonMultiBlockSplit) CarbonTableInputFormat(org.apache.carbondata.hadoop.api.CarbonTableInputFormat)
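
For orientation, below is a minimal sketch of how a Hadoop task could obtain splits from CarbonTableInputFormat and drive a record reader through the initialize/nextKeyValue lifecycle that the method above participates in. This is a hedged illustration, not code from the project: it assumes the job is already configured with the Carbon table path and projection, the readFirstSplit name and TaskAttemptID wiring are placeholders, and whether the returned reader is the streaming reader above depends on the input format and segment type.

// Assumed imports: java.io.IOException, java.util.List, org.apache.hadoop.mapreduce.*,
// org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl,
// org.apache.carbondata.hadoop.api.CarbonTableInputFormat
private void readFirstSplit(Job job) throws IOException, InterruptedException {
    CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
    // getSplits may return CarbonInputSplit or CarbonMultiBlockSplit instances
    List<InputSplit> splits = format.getSplits(job);
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader<Void, Object> reader = format.createRecordReader(splits.get(0), context);
    // initialize dispatches on the split type, as shown in the example above
    reader.initialize(splits.get(0), context);
    while (reader.nextKeyValue()) {
        Object row = reader.getCurrentValue();
        // consume the row
    }
    reader.close();
}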

Example 17 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonInputFormat method getDataBlocksOfSegment.

/**
 * Get the pruned data blocks of the given segments as CarbonInputSplit instances
 */
protected List<CarbonInputSplit> getDataBlocksOfSegment(JobContext job, CarbonTable carbonTable, FilterResolverIntf resolver, BitSet matchedPartitions, List<Segment> segmentIds, PartitionInfo partitionInfo, List<Integer> oldPartitionIdList) throws IOException {
    QueryStatisticsRecorder recorder = CarbonTimeStatisticsFactory.createDriverRecorder();
    QueryStatistic statistic = new QueryStatistic();
    // get delegation tokens for all FileSystems required for the table path
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { new Path(carbonTable.getTablePath()) }, job.getConfiguration());
    boolean distributedCG = Boolean.parseBoolean(CarbonProperties.getInstance().getProperty(CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP, CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP_DEFAULT));
    DataMapExprWrapper dataMapExprWrapper = DataMapChooser.get().choose(getOrCreateCarbonTable(job.getConfiguration()), resolver);
    DataMapJob dataMapJob = getDataMapJob(job.getConfiguration());
    List<PartitionSpec> partitionsToPrune = getPartitionsToPrune(job.getConfiguration());
    List<ExtendedBlocklet> prunedBlocklets;
    if (distributedCG || dataMapExprWrapper.getDataMapType() == DataMapLevel.FG) {
        DistributableDataMapFormat datamapDstr = new DistributableDataMapFormat(carbonTable, dataMapExprWrapper, segmentIds, partitionsToPrune, BlockletDataMapFactory.class.getName());
        prunedBlocklets = dataMapJob.execute(datamapDstr, resolver);
        // Apply expression on the blocklets.
        prunedBlocklets = dataMapExprWrapper.pruneBlocklets(prunedBlocklets);
    } else {
        prunedBlocklets = dataMapExprWrapper.prune(segmentIds, partitionsToPrune);
    }
    List<CarbonInputSplit> resultFilterredBlocks = new ArrayList<>();
    int partitionIndex = 0;
    List<Integer> partitionIdList = new ArrayList<>();
    if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
        partitionIdList = partitionInfo.getPartitionIds();
    }
    for (ExtendedBlocklet blocklet : prunedBlocklets) {
        long partitionId = CarbonTablePath.DataFileUtil.getTaskIdFromTaskNo(CarbonTablePath.DataFileUtil.getTaskNo(blocklet.getPath()));
        // alter-partition reads pass the old partition id list; a normal query uses the newest partitionIdList
        if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
            if (oldPartitionIdList != null) {
                partitionIndex = oldPartitionIdList.indexOf((int) partitionId);
            } else {
                partitionIndex = partitionIdList.indexOf((int) partitionId);
            }
        }
        if (partitionIndex != -1) {
            // skip this partition if it is not required
            if (matchedPartitions == null || matchedPartitions.get(partitionIndex)) {
                CarbonInputSplit inputSplit = convertToCarbonInputSplit(blocklet);
                if (inputSplit != null) {
                    resultFilterredBlocks.add(inputSplit);
                }
            }
        }
    }
    statistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
    recorder.recordStatisticsForDriver(statistic, job.getConfiguration().get("query.id"));
    return resultFilterredBlocks;
}
Also used : Path(org.apache.hadoop.fs.Path) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) ArrayList(java.util.ArrayList) BlockletDataMapFactory(org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMapFactory) DataMapExprWrapper(org.apache.carbondata.core.datamap.dev.expr.DataMapExprWrapper) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) PartitionSpec(org.apache.carbondata.core.indexstore.PartitionSpec) QueryStatisticsRecorder(org.apache.carbondata.core.stats.QueryStatisticsRecorder) ExtendedBlocklet(org.apache.carbondata.core.indexstore.ExtendedBlocklet) QueryStatistic(org.apache.carbondata.core.stats.QueryStatistic)
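
The branch above is what makes pruning either distributed or driver-local: when CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP is enabled, or the chosen datamap is fine-grained (FG), blocklet pruning is delegated to a DataMapJob; otherwise the DataMapExprWrapper prunes in the driver. A minimal sketch of toggling that property, using only the CarbonProperties API already referenced in the snippet:

// Sketch: opt in to distributed blocklet pruning before the query is planned.
// The value is parsed with Boolean.parseBoolean in the snippet above, so "true"
// selects the DistributableDataMapFormat / DataMapJob path.
CarbonProperties.getInstance()
    .addProperty(CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP, "true");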

Example 18 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonInputFormat method convertToCarbonInputSplit.

private CarbonInputSplit convertToCarbonInputSplit(ExtendedBlocklet blocklet) throws IOException {
    CarbonInputSplit split = CarbonInputSplit.from(blocklet.getSegmentId(), blocklet.getBlockletId(), new FileSplit(new Path(blocklet.getPath()), 0, blocklet.getLength(), blocklet.getLocations()), ColumnarFormatVersion.valueOf((short) blocklet.getDetailInfo().getVersionNumber()), blocklet.getDataMapWriterPath());
    split.setDetailInfo(blocklet.getDetailInfo());
    return split;
}
Also used : Path(org.apache.hadoop.fs.Path) CarbonTablePath(org.apache.carbondata.core.util.path.CarbonTablePath) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit)
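
Callers normally convert a whole list of pruned blocklets at once, as the loop in Example 17 does. A small sketch under that assumption, with the null check kept for symmetry with getDataBlocksOfSegment:

// Sketch: bulk conversion of pruned blocklets into splits; 'prunedBlocklets' is
// assumed to come from the datamap pruning shown in Example 17.
List<CarbonInputSplit> splits = new ArrayList<>();
for (ExtendedBlocklet blocklet : prunedBlocklets) {
    CarbonInputSplit split = convertToCarbonInputSplit(blocklet);
    if (split != null) {
        splits.add(split);
    }
}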

Example 19 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonTableInputFormat method getSplitsOfOneSegment.

/**
 * Read the data in one segment; used by the alter table partition statement.
 * @param job job context
 * @param targetSegment the segment to read
 * @param oldPartitionIdList the partition ids recorded before partitionInfo was changed
 * @return list of input splits for the target segment
 */
public List<InputSplit> getSplitsOfOneSegment(JobContext job, String targetSegment, List<Integer> oldPartitionIdList, PartitionInfo partitionInfo) {
    List<Segment> invalidSegments = new ArrayList<>();
    List<UpdateVO> invalidTimestampsList = new ArrayList<>();
    List<Segment> segmentList = new ArrayList<>();
    segmentList.add(new Segment(targetSegment, null));
    setSegmentsToAccess(job.getConfiguration(), segmentList);
    try {
        // process and resolve the expression
        Expression filter = getFilterPredicates(job.getConfiguration());
        CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
        // this will be null in case of corrupt schema file.
        if (null == carbonTable) {
            throw new IOException("Missing/Corrupt schema file for table.");
        }
        carbonTable.processFilterExpression(filter, null, null);
        TableProvider tableProvider = new SingleTableProvider(carbonTable);
        // prune partitions for filter query on partition table
        String partitionIds = job.getConfiguration().get(ALTER_PARTITION_ID);
        // matchedPartitions records partitionIndex, not partitionId
        BitSet matchedPartitions = null;
        if (partitionInfo != null) {
            matchedPartitions = setMatchedPartitions(partitionIds, filter, partitionInfo, oldPartitionIdList);
            if (matchedPartitions != null) {
                if (matchedPartitions.cardinality() == 0) {
                    return new ArrayList<InputSplit>();
                } else if (matchedPartitions.cardinality() == partitionInfo.getNumPartitions()) {
                    matchedPartitions = null;
                }
            }
        }
        FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
        // do block filtering and get split
        List<InputSplit> splits = getSplits(job, filterInterface, segmentList, matchedPartitions, partitionInfo, oldPartitionIdList, new SegmentUpdateStatusManager(carbonTable));
        // pass the invalid segments to the task side so their index entries can be removed there
        if (invalidSegments.size() > 0) {
            for (InputSplit split : splits) {
                ((CarbonInputSplit) split).setInvalidSegments(invalidSegments);
                ((CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
            }
        }
        return splits;
    } catch (IOException e) {
        throw new RuntimeException("Can't get splits of the target segment ", e);
    }
}
Also used : SegmentUpdateStatusManager(org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet) IOException(java.io.IOException) TableProvider(org.apache.carbondata.core.scan.filter.TableProvider) SingleTableProvider(org.apache.carbondata.core.scan.filter.SingleTableProvider) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) UpdateVO(org.apache.carbondata.core.mutate.UpdateVO) Segment(org.apache.carbondata.core.datamap.Segment) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) Expression(org.apache.carbondata.core.scan.expression.Expression) InputSplit(org.apache.hadoop.mapreduce.InputSplit) FilterResolverIntf(org.apache.carbondata.core.scan.filter.resolver.FilterResolverIntf)
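
For completeness, a hedged sketch of how a driver in the alter-partition flow might call this method. The 'job' and 'carbonTable' objects are assumed to exist, every literal value is a placeholder, and the getPartitionInfo accessor name is an assumption; only getSplitsOfOneSegment itself is taken from the example.

// Hypothetical caller-side sketch for an alter-table-partition read. The
// ALTER_PARTITION_ID entry consulted inside the method is expected to have been
// placed in job.getConfiguration() by the alter-partition command beforehand.
// Assumed imports: java.util.Arrays, java.util.List
CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
List<Integer> oldPartitionIdList = Arrays.asList(0, 1, 2, 3);    // partition ids before partitionInfo changed
PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getTableName());    // accessor name is an assumption
List<InputSplit> splits = format.getSplitsOfOneSegment(job, "12", oldPartitionIdList, partitionInfo);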

Example 20 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class InMemoryBTreeIndex method getTableBlockInfo.

/**
 * Gets the table block info for every split of the given segment.
 *
 * @param job                     job context
 * @return list of table block info
 * @throws IOException
 */
private List<TableBlockInfo> getTableBlockInfo(JobContext job) throws IOException {
    List<TableBlockInfo> tableBlockInfoList = new ArrayList<>();
    // identify table blocks from all file locations of given segment
    for (InputSplit inputSplit : segment.getAllSplits(job)) {
        CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
        BlockletInfos blockletInfos = new BlockletInfos(carbonInputSplit.getNumberOfBlocklets(), 0, carbonInputSplit.getNumberOfBlocklets());
        tableBlockInfoList.add(new TableBlockInfo(carbonInputSplit.getPath().toString(), carbonInputSplit.getBlockletId(), carbonInputSplit.getStart(), segment.getId(), carbonInputSplit.getLocations(), carbonInputSplit.getLength(), blockletInfos, carbonInputSplit.getVersion(), carbonInputSplit.getDeleteDeltaFiles()));
    }
    return tableBlockInfoList;
}
Also used : TableBlockInfo(org.apache.carbondata.core.datastore.block.TableBlockInfo) BlockletInfos(org.apache.carbondata.core.datastore.block.BlockletInfos) ArrayList(java.util.ArrayList) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
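
The three-argument BlockletInfos constructor above appears to take the total blocklet count, the starting blocklet number, and the number of blocklets to scan, so the snippet scans every blocklet of each block. Under that interpretation, which is inferred from the call site rather than documented behaviour, a partial scan could be expressed like this:

// Sketch: a BlockletInfos covering only the first half of a block's blocklets.
// Argument meaning (total, start, count) is inferred from the call above.
int totalBlocklets = carbonInputSplit.getNumberOfBlocklets();
BlockletInfos firstHalf = new BlockletInfos(totalBlocklets, 0, Math.max(1, totalBlocklets / 2));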

Aggregations

CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit) 33
ArrayList (java.util.ArrayList) 17
IOException (java.io.IOException) 15
InputSplit (org.apache.hadoop.mapreduce.InputSplit) 10
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable) 8
LinkedList (java.util.LinkedList) 6
CarbonMultiBlockSplit (org.apache.carbondata.hadoop.CarbonMultiBlockSplit) 6
IndexFilter (org.apache.carbondata.core.index.IndexFilter) 5
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath) 5
HashMap (java.util.HashMap) 4
HashSet (java.util.HashSet) 4
List (java.util.List) 4
TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo) 4
PartitionSpec (org.apache.carbondata.core.indexstore.PartitionSpec) 4
LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails) 4
SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager) 4
CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat) 4
Configuration (org.apache.hadoop.conf.Configuration) 4
Path (org.apache.hadoop.fs.Path) 4
Gson (com.google.gson.Gson) 3