Example 21 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

From class CarbonTableReader, method getInputSplits2.

public List<CarbonLocalInputSplit> getInputSplits2(CarbonTableCacheModel tableCacheModel, Expression filters) {
    List<CarbonLocalInputSplit> result = new ArrayList<>();
    if (config.getUnsafeMemoryInMb() != null) {
        CarbonProperties.getInstance().addProperty(CarbonCommonConstants.UNSAFE_WORKING_MEMORY_IN_MB, config.getUnsafeMemoryInMb());
    }
    CarbonTable carbonTable = tableCacheModel.carbonTable;
    TableInfo tableInfo = tableCacheModel.carbonTable.getTableInfo();
    // note: this local Hadoop Configuration shadows the connector's `config`
    // field that was read above
    Configuration config = new Configuration();
    config.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
    String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
    config.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
    config.set(CarbonTableInputFormat.DATABASE_NAME, carbonTable.getDatabaseName());
    config.set(CarbonTableInputFormat.TABLE_NAME, carbonTable.getTableName());
    try {
        CarbonTableInputFormat.setTableInfo(config, tableInfo);
        CarbonTableInputFormat carbonTableInputFormat = createInputFormat(config, carbonTable.getAbsoluteTableIdentifier(), filters);
        JobConf jobConf = new JobConf(config);
        Job job = Job.getInstance(jobConf);
        List<InputSplit> splits = carbonTableInputFormat.getSplits(job);
        CarbonInputSplit carbonInputSplit = null;
        Gson gson = new Gson();
        if (splits != null && splits.size() > 0) {
            for (InputSplit inputSplit : splits) {
                carbonInputSplit = (CarbonInputSplit) inputSplit;
                result.add(new CarbonLocalInputSplit(carbonInputSplit.getSegmentId(),
                        carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(),
                        carbonInputSplit.getLength(), Arrays.asList(carbonInputSplit.getLocations()),
                        carbonInputSplit.getNumberOfBlocklets(), carbonInputSplit.getVersion().number(),
                        carbonInputSplit.getDeleteDeltaFiles(),
                        gson.toJson(carbonInputSplit.getDetailInfo())));
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Error creating Splits from CarbonTableInputFormat", e);
    }
    return result;
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), ArrayList (java.util.ArrayList), Gson (com.facebook.presto.hadoop.$internal.com.google.gson.Gson), CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit), IOException (java.io.IOException), CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable), CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat), TableInfo (org.apache.carbondata.core.metadata.schema.table.TableInfo), Job (org.apache.hadoop.mapreduce.Job), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
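
The gson.toJson(...) call above flattens the split's detail info into a JSON string so it can ride along inside the serializable CarbonLocalInputSplit. Below is a minimal sketch of that round trip; Info is a toy stand-in for Carbon's real detail-info class, not part of the project.

import com.google.gson.Gson;

public class DetailInfoRoundTrip {

    // Toy stand-in for the detail-info object carried by the split;
    // any plain POJO serializes the same way.
    static class Info {
        long blockFooterOffset;
        int rowCount;
    }

    public static void main(String[] args) {
        Gson gson = new Gson();
        Info info = new Info();
        info.blockFooterOffset = 1024L;
        info.rowCount = 32000;
        // what getInputSplits2 stores into the local split
        String json = gson.toJson(info);
        // what the consuming side recovers from the split
        Info back = gson.fromJson(json, Info.class);
        System.out.println(json + " -> rows=" + back.rowCount);
    }
}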

Example 22 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

From class CarbonTableInputFormat, method getSplits.

/**
 * Get the list of blocks/blocklets and convert them into CarbonInputSplits.
 * @param job JobContext carrying the Configuration
 * @return list of CarbonInputSplit
 * @throws IOException if the table schema is missing or split planning fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    carbonTable = getOrCreateCarbonTable(job.getConfiguration());
    if (null == carbonTable) {
        throw new IOException("Missing/Corrupt schema file for table.");
    }
    // global dictionary is not supported since 2.0
    if (carbonTable.getTableInfo().getFactTable().getTableProperties().containsKey(CarbonCommonConstants.DICTIONARY_INCLUDE)) {
        DeprecatedFeatureException.globalDictNotSupported();
    }
    List<InputSplit> splits = new LinkedList<>();
    if (CarbonProperties.isQueryStageInputEnabled()) {
        // if there are stage files, collect them and create splits so that
        // they are included in the query
        try {
            List<InputSplit> stageInputSplits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration());
            splits.addAll(stageInputSplits);
        } catch (ExecutionException | InterruptedException e) {
            LOG.error("Failed to create input splits from stage files", e);
            throw new IOException(e);
        }
    }
    this.readCommittedScope = getReadCommitted(job, carbonTable.getAbsoluteTableIdentifier());
    LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
    String updateDeltaVersion = job.getConfiguration().get(UPDATE_DELTA_VERSION);
    SegmentUpdateStatusManager updateStatusManager;
    if (updateDeltaVersion != null) {
        updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails, updateDeltaVersion);
    } else {
        updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails);
    }
    List<String> invalidSegmentIds = new ArrayList<>();
    List<Segment> streamSegments = null;
    // get all valid segments and set them into the configuration
    SegmentStatusManager segmentStatusManager = new SegmentStatusManager(carbonTable.getAbsoluteTableIdentifier(), readCommittedScope.getConfiguration());
    SegmentStatusManager.ValidAndInvalidSegmentsInfo segments = segmentStatusManager.getValidAndInvalidSegments(carbonTable.isMV(), loadMetadataDetails, this.readCommittedScope);
    if (getValidateSegmentsToAccess(job.getConfiguration())) {
        List<Segment> validSegments = segments.getValidSegments();
        streamSegments = segments.getStreamSegments();
        streamSegments = getFilteredSegment(job, streamSegments, true, readCommittedScope);
        if (validSegments.size() == 0) {
            splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
            return splits;
        }
        List<Segment> filteredSegmentToAccess = getFilteredSegment(job, segments.getValidSegments(), true, readCommittedScope);
        if (filteredSegmentToAccess.size() == 0) {
            splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
            return splits;
        } else {
            setSegmentsToAccess(job.getConfiguration(), filteredSegmentToAccess);
        }
        // clear cached index entries for any invalid segments
        for (Segment segment : segments.getInvalidSegments()) {
            invalidSegmentIds.add(segment.getSegmentNo());
        }
        if (invalidSegmentIds.size() > 0) {
            IndexStoreManager.getInstance().clearInvalidSegments(getOrCreateCarbonTable(job.getConfiguration()), invalidSegmentIds);
        }
    }
    List<Segment> validAndInProgressSegments = new ArrayList<>(segments.getValidSegments());
    // Also add in-progress segments to the filter, because a Secondary Index
    // table load reads data from segments that are still in progress.
    validAndInProgressSegments.addAll(segments.getListOfInProgressSegments());
    List<Segment> segmentToAccess = getFilteredSegment(job, validAndInProgressSegments, false, readCommittedScope);
    String segmentFileName = job.getConfiguration().get(CarbonCommonConstants.CURRENT_SEGMENTFILE);
    if (segmentFileName != null) {
        // each segment has only one segment file ("current.segment")
        segmentToAccess.get(0).setSegmentFileName(segmentFileName + CarbonTablePath.SEGMENT_EXT);
    }
    // process and resolve the expression
    IndexFilter indexFilter = getFilterPredicates(job.getConfiguration());
    if (indexFilter != null) {
        indexFilter.resolve(false);
    }
    // do block filtering and get split
    List<InputSplit> batchSplits = getSplits(job, indexFilter, segmentToAccess, updateStatusManager, segments.getInvalidSegments());
    splits.addAll(batchSplits);
    // add all splits of streaming
    List<InputSplit> splitsOfStreaming = getSplitsOfStreaming(job, streamSegments, carbonTable);
    if (!splitsOfStreaming.isEmpty()) {
        splits.addAll(splitsOfStreaming);
    }
    return splits;
}
Also used: SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager), LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails), ArrayList (java.util.ArrayList), SegmentStatusManager (org.apache.carbondata.core.statusmanager.SegmentStatusManager), IOException (java.io.IOException), LinkedList (java.util.LinkedList), Segment (org.apache.carbondata.core.index.Segment), IndexFilter (org.apache.carbondata.core.index.IndexFilter), ExecutionException (java.util.concurrent.ExecutionException), InputSplit (org.apache.hadoop.mapreduce.InputSplit), CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit)
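
A minimal sketch of driving this getSplits implementation from a standalone tool. The store path, database, and table name below are hypothetical; the configuration keys are the same ones set in Example 21. Depending on the deployment, setTableInfo may also be required, as Example 21 shows.

import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;

public class SplitPlanner {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical table location and identity
        conf.set(CarbonTableInputFormat.INPUT_DIR, "/tmp/carbon/store/default/demo");
        conf.set(CarbonTableInputFormat.DATABASE_NAME, "default");
        conf.set(CarbonTableInputFormat.TABLE_NAME, "demo");
        Job job = Job.getInstance(conf);
        CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
        for (InputSplit split : format.getSplits(job)) {
            System.out.println(split);
        }
    }
}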

Example 23 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

From class StreamRecordReader, method initialize.

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    // input
    if (split instanceof CarbonInputSplit) {
        fileSplit = (CarbonInputSplit) split;
    } else if (split instanceof CarbonMultiBlockSplit) {
        fileSplit = ((CarbonMultiBlockSplit) split).getAllSplits().get(0);
    } else {
        fileSplit = (FileSplit) split;
    }
    // metadata
    hadoopConf = context.getConfiguration();
    if (model == null) {
        CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
        model = format.createQueryModel(split, context);
    }
    carbonTable = model.getTable();
    List<CarbonDimension> dimensions = carbonTable.getVisibleDimensions();
    dimensionCount = dimensions.size();
    List<CarbonMeasure> measures = carbonTable.getVisibleMeasures();
    measureCount = measures.size();
    List<CarbonColumn> carbonColumnList = carbonTable.getStreamStorageOrderColumn();
    storageColumns = carbonColumnList.toArray(new CarbonColumn[carbonColumnList.size()]);
    isNoDictColumn = CarbonDataProcessorUtil.getNoDictionaryMapping(storageColumns);
    directDictionaryGenerators = new DirectDictionaryGenerator[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
        if (storageColumns[i].getDataType() == DataTypes.DATE) {
            directDictionaryGenerators[i] = DirectDictionaryKeyGeneratorFactory.getDirectDictionaryGenerator(storageColumns[i].getDataType());
        }
    }
    dimensionsIsVarcharTypeMap = new boolean[dimensionCount];
    for (int i = 0; i < dimensionCount; i++) {
        dimensionsIsVarcharTypeMap[i] = storageColumns[i].getDataType() == DataTypes.VARCHAR;
    }
    measureDataTypes = new DataType[measureCount];
    for (int i = 0; i < measureCount; i++) {
        measureDataTypes[i] = storageColumns[dimensionCount + i].getDataType();
    }
    // decode data
    allNonNull = new BitSet(storageColumns.length);
    projection = model.getProjectionColumns();
    isRequired = new boolean[storageColumns.length];
    boolean[] isFilterDimensions = model.getIsFilterDimensions();
    boolean[] isFilterMeasures = model.getIsFilterMeasures();
    isFilterRequired = new boolean[storageColumns.length];
    filterMap = new int[storageColumns.length];
    for (int i = 0; i < storageColumns.length; i++) {
        if (storageColumns[i].isDimension()) {
            if (isFilterDimensions[storageColumns[i].getOrdinal()]) {
                isRequired[i] = true;
                isFilterRequired[i] = true;
                filterMap[i] = storageColumns[i].getOrdinal();
            }
        } else {
            if (isFilterMeasures[storageColumns[i].getOrdinal()]) {
                isRequired[i] = true;
                isFilterRequired[i] = true;
                filterMap[i] = carbonTable.getDimensionOrdinalMax() + storageColumns[i].getOrdinal();
            }
        }
    }
    isProjectionRequired = new boolean[storageColumns.length];
    projectionMap = new int[storageColumns.length];
    for (int j = 0; j < projection.length; j++) {
        for (int i = 0; i < storageColumns.length; i++) {
            if (storageColumns[i].getColName().equals(projection[j].getColName())) {
                isRequired[i] = true;
                isProjectionRequired[i] = true;
                projectionMap[i] = j;
                break;
            }
        }
    }
    // initialize filter
    if (null != model.getIndexFilter()) {
        initializeFilter();
    } else if (projection.length == 0) {
        skipScanData = true;
    }
}
Also used: CarbonColumn (org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn), BitSet (java.util.BitSet), CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), CarbonDimension (org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension), CarbonMeasure (org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure), CarbonMultiBlockSplit (org.apache.carbondata.hadoop.CarbonMultiBlockSplit), CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat)
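
On the calling side, initialize receives the split together with a TaskAttemptContext before iteration begins. A minimal sketch of that standard Hadoop driving loop, assuming the reader and split come from the input format; TaskAttemptContextImpl is the stock Hadoop implementation often used in tests and tools.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class ReaderDriver {

    // Drain one split through a record reader and count the rows;
    // the key/value types depend on the concrete reader.
    static <K, V> long readSplit(RecordReader<K, V> reader, InputSplit split,
            Configuration conf) throws Exception {
        TaskAttemptContextImpl context =
                new TaskAttemptContextImpl(conf, new TaskAttemptID());
        reader.initialize(split, context);
        long rows = 0;
        while (reader.nextKeyValue()) {
            rows++;
        }
        reader.close();
        return rows;
    }
}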

Example 24 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

From class CarbonVectorizedRecordReader, method initialize.

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException {
    List<CarbonInputSplit> splitList;
    if (inputSplit instanceof CarbonInputSplit) {
        // Read the footer offset from the end of the file and set it on the split.
        CarbonInputSplit carbonInputSplit = ((CarbonInputSplit) inputSplit);
        String splitPath = carbonInputSplit.getFilePath();
        if ((null != carbonInputSplit.getDetailInfo()
                && carbonInputSplit.getDetailInfo().getBlockFooterOffset() == 0L)
                || (null == carbonInputSplit.getDetailInfo()
                && carbonInputSplit.getStart() == 0)) {
            FileReader reader = FileFactory.getFileHolder(
                    FileFactory.getFileType(splitPath), taskAttemptContext.getConfiguration());
            ByteBuffer buffer = reader.readByteBuffer(
                    FileFactory.getUpdatedFilePath(splitPath),
                    carbonInputSplit.getLength() - 8, 8);
            if (carbonInputSplit.getDetailInfo() == null) {
                carbonInputSplit.setStart(buffer.getLong());
            } else {
                carbonInputSplit.getDetailInfo().setBlockFooterOffset(buffer.getLong());
            }
            reader.finish();
        }
        splitList = new ArrayList<>(1);
        splitList.add((CarbonInputSplit) inputSplit);
    } else {
        throw new RuntimeException("unsupported input split type: " + inputSplit);
    }
    List<TableBlockInfo> tableBlockInfoList = CarbonInputSplit.createBlocks(splitList);
    queryModel.setTableBlockInfos(tableBlockInfoList);
    queryModel.setVectorReader(true);
    try {
        queryExecutor = QueryExecutorFactory.getQueryExecutor(queryModel, taskAttemptContext.getConfiguration());
        iterator = (AbstractDetailQueryResultIterator) queryExecutor.execute(queryModel);
        initBatch();
    } catch (Exception e) {
        LOGGER.error(e);
        throw e;
    }
}
Also used: TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo), FileReader (org.apache.carbondata.core.datastore.FileReader), CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit), ByteBuffer (java.nio.ByteBuffer), QueryExecutionException (org.apache.carbondata.core.scan.executor.exception.QueryExecutionException), IOException (java.io.IOException)
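
The ByteBuffer read above fetches the last 8 bytes of the data file, where the footer's start offset is stored as a big-endian long. A minimal local-filesystem sketch of the same trailing-footer convention, using RandomAccessFile in place of Carbon's FileReader:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;

public class FooterOffset {

    // Read the 8-byte footer-offset trailer from the end of a file.
    static long readFooterOffset(String path) throws IOException {
        try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
            byte[] tail = new byte[8];
            raf.seek(raf.length() - 8);
            raf.readFully(tail);
            return ByteBuffer.wrap(tail).getLong(); // ByteBuffer is big-endian by default
        }
    }
}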

Example 25 with CarbonInputSplit

Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

From class CarbonInputFormat, method getDataBlocksOfSegment.

/**
 * get data blocks of given segment
 */
protected List<CarbonInputSplit> getDataBlocksOfSegment(JobContext job, CarbonTable carbonTable, IndexFilter expression, List<Segment> validSegments, List<Segment> invalidSegments, List<String> segmentsToBeRefreshed) throws IOException {
    QueryStatisticsRecorder recorder = CarbonTimeStatisticsFactory.createDriverRecorder();
    QueryStatistic statistic = new QueryStatistic();
    List<ExtendedBlocklet> prunedBlocklets = getPrunedBlocklets(job, carbonTable, expression, validSegments, invalidSegments, segmentsToBeRefreshed);
    List<CarbonInputSplit> resultFilteredBlocks = new ArrayList<>();
    for (ExtendedBlocklet blocklet : prunedBlocklets) {
        // matchedPartitions will be null in two cases:
        // 1. the table is not a partition table
        // 2. the table is a partition table and all partitions match the query
        // For a partition table, the task id in the carbondata file name is the
        // partition id; if a partition is not required, it is skipped here.
        resultFilteredBlocks.add(blocklet.getInputSplit());
    }
    statistic.addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
    recorder.recordStatisticsForDriver(statistic, job.getConfiguration().get("query.id"));
    return resultFilteredBlocks;
}
Also used: ArrayList (java.util.ArrayList), QueryStatisticsRecorder (org.apache.carbondata.core.stats.QueryStatisticsRecorder), CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit), ExtendedBlocklet (org.apache.carbondata.core.indexstore.ExtendedBlocklet), QueryStatistic (org.apache.carbondata.core.stats.QueryStatistic)
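
The QueryStatistic calls above record a driver-side data point under LOAD_BLOCKS_DRIVER and attach it to the query id from the configuration. A rough, hypothetical stand-in using a plain stopwatch, for readers without the Carbon stats classes at hand:

public class DriverTiming {
    public static void main(String[] args) throws InterruptedException {
        long begin = System.currentTimeMillis();
        Thread.sleep(25); // stands in for getPrunedBlocklets(...)
        long elapsedMs = System.currentTimeMillis() - begin;
        System.out.println("query-1 LOAD_BLOCKS_DRIVER: " + elapsedMs + " ms");
    }
}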

Aggregations

CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit): 33 usages
ArrayList (java.util.ArrayList): 17 usages
IOException (java.io.IOException): 15 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 10 usages
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable): 8 usages
LinkedList (java.util.LinkedList): 6 usages
CarbonMultiBlockSplit (org.apache.carbondata.hadoop.CarbonMultiBlockSplit): 6 usages
IndexFilter (org.apache.carbondata.core.index.IndexFilter): 5 usages
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath): 5 usages
HashMap (java.util.HashMap): 4 usages
HashSet (java.util.HashSet): 4 usages
List (java.util.List): 4 usages
TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo): 4 usages
PartitionSpec (org.apache.carbondata.core.indexstore.PartitionSpec): 4 usages
LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails): 4 usages
SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager): 4 usages
CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat): 4 usages
Configuration (org.apache.hadoop.conf.Configuration): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
Gson (com.google.gson.Gson): 3 usages