
Example 26 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonFileInputFormat method getSplits.

/**
 * {@inheritDoc}
 * The configurations FileInputFormat.INPUT_DIR and CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS
 * are used to resolve the table path and the segments to read.
 *
 * @return input splits for the valid segments
 * @throws IOException
 */
private List<InputSplit> getSplits(JobContext job, IndexFilter indexFilter, List<Segment> validSegments) throws IOException {
    numSegments = validSegments.size();
    // for each segment fetch blocks matching filter in Driver BTree
    List<CarbonInputSplit> dataBlocksOfSegment = getDataBlocksOfSegment(job, carbonTable, indexFilter, validSegments, new ArrayList<>(), new ArrayList<>());
    numBlocks = dataBlocksOfSegment.size();
    List<String> allDeleteDeltaFiles = getAllDeleteDeltaFiles(carbonTable.getTablePath());
    if (CollectionUtils.isNotEmpty(allDeleteDeltaFiles)) {
        for (CarbonInputSplit split : dataBlocksOfSegment) {
            split.setDeleteDeltaFiles(getDeleteDeltaFiles(split.getFilePath(), allDeleteDeltaFiles));
        }
    }
    return new LinkedList<>(dataBlocksOfSegment);
}
Also used : CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) LinkedList(java.util.LinkedList)
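The delete-delta wiring above depends on getDeleteDeltaFiles selecting, from every delta file under the table path, only those belonging to a given split's block. Below is a simplified, hypothetical sketch of that matching idea; matchDeleteDeltas is our name, and the real method may also match on timestamps:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class DeleteDeltaMatchSketch {
    // Hypothetical matcher: keep only delta files whose name starts with the
    // split's block name. The real getDeleteDeltaFiles may apply stricter rules.
    static String[] matchDeleteDeltas(String blockPath, List<String> allDeltaFiles) {
        String fileName = blockPath.substring(blockPath.lastIndexOf('/') + 1);
        String blockName = fileName.substring(0, fileName.indexOf(".carbondata"));
        List<String> matched = new ArrayList<>();
        for (String delta : allDeltaFiles) {
            String deltaName = delta.substring(delta.lastIndexOf('/') + 1);
            if (deltaName.startsWith(blockName)) {
                matched.add(delta);
            }
        }
        return matched.toArray(new String[0]);
    }

    public static void main(String[] args) {
        List<String> deltas = Arrays.asList(
                "/t/Fact/Part0/Segment_0/part-0-0_batchno0-0-0-1.deletedelta",
                "/t/Fact/Part0/Segment_0/part-1-0_batchno0-0-0-1.deletedelta");
        String[] hit = matchDeleteDeltas(
                "/t/Fact/Part0/Segment_0/part-0-0_batchno0-0-0-1.carbondata", deltas);
        System.out.println(hit.length + " matching delta file(s)"); // prints 1
    }
}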

Example 27 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonCompactionUtil method getOverallMinMax.

public static Object[] getOverallMinMax(CarbonInputSplit[] carbonInputSplits, CarbonColumn rangeCol, boolean isSortCol) {
    byte[] minVal = null;
    byte[] maxVal = null;
    int dictMinVal = Integer.MAX_VALUE;
    int dictMaxVal = Integer.MIN_VALUE;
    int idx = -1;
    DataType dataType = rangeCol.getDataType();
    Object[] minMaxVals = new Object[2];
    boolean isDictEncode = rangeCol.getDataType() == DataTypes.DATE;
    try {
        for (CarbonInputSplit split : carbonInputSplits) {
            DataFileFooter dataFileFooter = CarbonUtil.readMetadataFile(CarbonInputSplit.getTableBlockInfo(split), true);
            if (-1 == idx) {
                List<ColumnSchema> allColumns = dataFileFooter.getColumnInTable();
                for (int i = 0; i < allColumns.size(); i++) {
                    if (allColumns.get(i).getColumnName().equalsIgnoreCase(rangeCol.getColName())) {
                        idx = i;
                        break;
                    }
                }
            }
            if (isDictEncode) {
                byte[] tempMin = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMinValues()[idx];
                int tempMinVal = CarbonUtil.getSurrogateInternal(tempMin, 0, tempMin.length);
                byte[] tempMax = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMaxValues()[idx];
                int tempMaxVal = CarbonUtil.getSurrogateInternal(tempMax, 0, tempMax.length);
                if (dictMinVal > tempMinVal) {
                    dictMinVal = tempMinVal;
                }
                if (dictMaxVal < tempMaxVal) {
                    dictMaxVal = tempMaxVal;
                }
            } else {
                if (null == minVal) {
                    minVal = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMinValues()[idx];
                    maxVal = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMaxValues()[idx];
                } else {
                    byte[] tempMin = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMinValues()[idx];
                    byte[] tempMax = dataFileFooter.getBlockletIndex().getMinMaxIndex().getMaxValues()[idx];
                    if (ByteUtil.compare(tempMin, minVal) <= 0) {
                        minVal = tempMin;
                    }
                    if (ByteUtil.compare(tempMax, maxVal) >= 0) {
                        maxVal = tempMax;
                    }
                }
            }
        }
        // Convert the stored footer bytes into typed min/max values based on the column's data type
        if (isDictEncode) {
            minMaxVals[0] = dictMinVal;
            minMaxVals[1] = dictMaxVal;
        } else {
            if (!isSortCol && (dataType == DataTypes.INT || dataType == DataTypes.LONG)) {
                minMaxVals[0] = ByteUtil.toLong(minVal, 0, minVal.length);
                minMaxVals[1] = ByteUtil.toLong(maxVal, 0, maxVal.length);
            } else if (dataType == DataTypes.DOUBLE) {
                minMaxVals[0] = ByteUtil.toDouble(minVal, 0, minVal.length);
                minMaxVals[1] = ByteUtil.toDouble(maxVal, 0, maxVal.length);
            } else {
                minMaxVals[0] = DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(minVal, dataType, true);
                minMaxVals[1] = DataTypeUtil.getDataBasedOnDataTypeForNoDictionaryColumn(maxVal, dataType, true);
            }
        }
    } catch (IOException e) {
        LOGGER.error(e.getMessage());
    }
    return minMaxVals;
}
Also used : DataFileFooter(org.apache.carbondata.core.metadata.blocklet.DataFileFooter) DataType(org.apache.carbondata.core.metadata.datatype.DataType) ColumnSchema(org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IOException(java.io.IOException)
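A hedged caller-side sketch of how the two-element result might be consumed, assuming CarbonCompactionUtil lives in org.apache.carbondata.processing.merger: dictionary-encoded DATE columns come back as Integer surrogates, non-sort INT/LONG columns as Long, DOUBLE as Double, anything else as a decoded no-dictionary object.

import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.processing.merger.CarbonCompactionUtil;

public class RangeSketch {
    // Minimal sketch, assuming splits and rangeCol come from the compaction flow above.
    static void printOverallRange(CarbonInputSplit[] splits, CarbonColumn rangeCol) {
        Object[] minMax = CarbonCompactionUtil.getOverallMinMax(splits, rangeCol, false);
        if (rangeCol.getDataType() == DataTypes.DATE) {
            // Dictionary surrogates, not epoch values.
            System.out.println("surrogate range: " + minMax[0] + ".." + minMax[1]);
        } else {
            System.out.println("range: " + minMax[0] + ".." + minMax[1]);
        }
    }
}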

Example 28 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonTableReader method getInputSplits.

/**
 * Gets carbon multi-block input splits.
 *
 * @param tableCacheModel cached table
 * @param filters carbonData filters
 * @param filteredPartitions partitionSpecs matching the filter
 * @param config hadoop conf
 * @return list of multi-block splits
 * @throws IOException
 */
public List<CarbonLocalMultiBlockSplit> getInputSplits(CarbonTableCacheModel tableCacheModel, Expression filters, List<PartitionSpec> filteredPartitions, Configuration config) throws IOException {
    List<CarbonLocalInputSplit> result = new ArrayList<>();
    List<CarbonLocalMultiBlockSplit> multiBlockSplitList = new ArrayList<>();
    CarbonTable carbonTable = tableCacheModel.getCarbonTable();
    TableInfo tableInfo = tableCacheModel.getCarbonTable().getTableInfo();
    config.set("presto.cli.query.id", prestoQueryId);
    config.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
    String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
    config.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
    config.set(CarbonTableInputFormat.DATABASE_NAME, carbonTable.getDatabaseName());
    config.set(CarbonTableInputFormat.TABLE_NAME, carbonTable.getTableName());
    config.set("query.id", queryId);
    CarbonInputFormat.setTransactionalTable(config, carbonTable.isTransactionalTable());
    CarbonInputFormat.setTableInfo(config, carbonTable.getTableInfo());
    if (CarbonProperties.getInstance().isCoarseGrainSecondaryIndex(tableInfo.getDatabaseName(), tableInfo.getFactTable().getTableName(), "true")) {
        CarbonInputFormat.checkAndSetSecondaryIndexPruning(carbonTable.getTableInfo(), filters, config);
    }
    JobConf jobConf = new JobConf(config);
    try {
        CarbonTableInputFormat.setTableInfo(config, tableInfo);
        CarbonTableInputFormat<Object> carbonTableInputFormat = createInputFormat(jobConf, carbonTable.getAbsoluteTableIdentifier(), new IndexFilter(carbonTable, filters, true), filteredPartitions);
        Job job = Job.getInstance(jobConf);
        List<InputSplit> splits = carbonTableInputFormat.getSplits(job);
        Gson gson = new Gson();
        if (splits != null && splits.size() > 0) {
            for (InputSplit inputSplit : splits) {
                CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
                result.add(new CarbonLocalInputSplit(carbonInputSplit.getSegmentId(), carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(), carbonInputSplit.getLength(), Arrays.asList(carbonInputSplit.getLocations()), carbonInputSplit.getNumberOfBlocklets(), carbonInputSplit.getVersion().number(), carbonInputSplit.getDeleteDeltaFiles(), carbonInputSplit.getBlockletId(), gson.toJson(carbonInputSplit.getDetailInfo()), carbonInputSplit.getFileFormat().ordinal()));
            }
            // Use block distribution
            List<List<CarbonLocalInputSplit>> inputSplits = new ArrayList<>(result.stream().collect(Collectors.groupingBy(carbonInput -> {
                if (FileFormat.ROW_V1.equals(carbonInput.getFileFormat())) {
                    return carbonInput.getSegmentId().concat(carbonInput.getPath()).concat(carbonInput.getStart() + "");
                }
                return carbonInput.getSegmentId().concat(carbonInput.getPath());
            })).values());
            // TODO: try to optimize the logic below as it may slow down for a huge number of splits
            for (int j = 0; j < inputSplits.size(); j++) {
                multiBlockSplitList.add(new CarbonLocalMultiBlockSplit(inputSplits.get(j), inputSplits.get(j).stream().flatMap(f -> Arrays.stream(getLocations(f))).distinct().toArray(String[]::new)));
            }
            LOGGER.error("Size of MultiblockList " + multiBlockSplitList.size());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return multiBlockSplitList;
}
Also used : ArrayList(java.util.ArrayList) Arrays(java.util.Arrays) List(java.util.List) Collectors(java.util.stream.Collectors) Gson(com.google.gson.Gson) IOException(java.io.IOException) Configuration(org.apache.hadoop.conf.Configuration) JobConf(org.apache.hadoop.mapred.JobConf) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) CarbonInputFormat(org.apache.carbondata.hadoop.api.CarbonInputFormat) CarbonTableInputFormat(org.apache.carbondata.hadoop.api.CarbonTableInputFormat) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) TableInfo(org.apache.carbondata.core.metadata.schema.table.TableInfo) IndexFilter(org.apache.carbondata.core.index.IndexFilter) PartitionSpec(org.apache.carbondata.core.indexstore.PartitionSpec) Expression(org.apache.carbondata.core.scan.expression.Expression) CarbonProperties(org.apache.carbondata.core.util.CarbonProperties) FileFormat(org.apache.carbondata.core.statusmanager.FileFormat)
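Grouping by segmentId + path is what folds multiple blocklet-level splits of one physical file into a single multi-block split (ROW_V1 streaming splits also key on the start offset, so each chunk stays separate). A self-contained sketch of the same Collectors.groupingBy pattern, using plain string stand-ins rather than CarbonLocalInputSplit:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class GroupByFileDemo {
    public static void main(String[] args) {
        // Stand-ins for CarbonLocalInputSplit: {segmentId, path} pairs.
        List<String[]> splits = Arrays.asList(
                new String[]{"0", "/t/part-0.carbondata"},
                new String[]{"0", "/t/part-0.carbondata"},  // second blocklet, same file
                new String[]{"1", "/t/part-1.carbondata"});
        // Same pattern as above: group splits that share segment + path.
        List<List<String[]>> grouped = new ArrayList<>(splits.stream()
                .collect(Collectors.groupingBy(s -> s[0].concat(s[1])))
                .values());
        grouped.forEach(g ->
                System.out.println(g.size() + " split(s) in one multi-block"));
    }
}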

Example 29 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonReaderBuilder method totalRowCountInSplits.

private <T> void totalRowCountInSplits(Job job, List<InputSplit> splits, List<Long> rowCountInSplit) throws IOException, InterruptedException {
    CarbonFileInputFormat format = this.prepareFileInputFormat(job, false, true);
    long sum = 0;
    boolean isIUDTable = false;
    if (!StringUtils.isEmpty(this.tablePath)) {
        // Check if update or delete happened on the table.
        CarbonFile emptyMetadataFile = FileFactory.getCarbonFile(this.tablePath + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.CARBON_SDK_EMPTY_METADATA_PATH, this.hadoopConf);
        if (emptyMetadataFile.exists() && emptyMetadataFile.isDirectory()) {
            isIUDTable = true;
        }
    }
    // If a filter exists or the table saw updates/deletes, count rows by actually reading each split; otherwise read the row count from each split's detail info.
    if (this.filterExpression != null || isIUDTable) {
        RecordReader reader = null;
        CarbonReader carbonReader = null;
        for (InputSplit split : splits) {
            List<RecordReader<Void, T>> readers = new ArrayList<>();
            try {
                reader = this.getRecordReader(job, format, readers, split);
                readers.add(reader);
                carbonReader = new CarbonReader<>(readers);
                while (carbonReader.hasNext()) {
                    try {
                        sum += carbonReader.readNextBatchRow().length;
                    } catch (Exception ex) {
                        LOGGER.error("Exception occurred while reading the batch row " + ex.getMessage());
                    }
                }
                rowCountInSplit.add(sum);
            } finally {
                if (reader != null) {
                    reader.close();
                }
                if (carbonReader != null) {
                    carbonReader.close();
                }
            }
        }
    } else {
        for (InputSplit split : splits) {
            // Build a running (prefix) sum of row counts across splits;
            // it is used for pruning with pagination values.
            // At index i it holds the total rows of splits 0 through i.
            sum += ((CarbonInputSplit) split).getDetailInfo().getRowCount();
            rowCountInSplit.add(sum);
        }
    }
}
Also used : CarbonFile(org.apache.carbondata.core.datastore.filesystem.CarbonFile) CarbonVectorizedRecordReader(org.apache.carbondata.hadoop.util.CarbonVectorizedRecordReader) RecordReader(org.apache.hadoop.mapreduce.RecordReader) ArrayList(java.util.ArrayList) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IOException(java.io.IOException)
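Because rowCountInSplit is a running sum, the row count of split i alone is rowCountInSplit.get(i) minus the previous entry, and a pagination row number can be mapped to its containing split with a binary search. A minimal sketch of that lookup; splitForRow is our name, not an SDK method:

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class SplitLookupDemo {
    // Hypothetical helper: index of the split containing a 1-based row number.
    static int splitForRow(List<Long> rowCountInSplit, long rowNumber) {
        int pos = Collections.binarySearch(rowCountInSplit, rowNumber);
        // An exact hit means rowNumber is the last row of split pos; a miss
        // returns (-(insertionPoint) - 1), and insertionPoint is the split.
        return pos >= 0 ? pos : -(pos + 1);
    }

    public static void main(String[] args) {
        List<Long> prefixSums = Arrays.asList(100L, 250L, 400L); // 100, 150, 150 rows
        System.out.println(splitForRow(prefixSums, 101)); // 1: first row of split 1
        System.out.println(splitForRow(prefixSums, 100)); // 0: last row of split 0
    }
}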

Example 30 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonReaderBuilder method getSplits.

/**
 * Gets an array of CarbonInputSplits.
 * In CarbonData, splits can be block level or blocklet level;
 * by default splits are block level.
 *
 * @param enableBlockletDistribution returns blocklet level splits if set to true,
 *                                   else block level splits.
 * @return
 * @throws IOException
 */
public InputSplit[] getSplits(boolean enableBlockletDistribution) throws IOException {
    if (hadoopConf == null) {
        hadoopConf = FileFactory.getConfiguration();
    }
    Job job = null;
    List<InputSplit> splits;
    CarbonFileInputFormat format = null;
    try {
        job = new Job(new JobConf(hadoopConf));
        format = prepareFileInputFormat(job, enableBlockletDistribution, false);
        splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
        for (InputSplit split : splits) {
            // Load the detailInfo
            ((CarbonInputSplit) split).getDetailInfo();
        }
    } finally {
        if (format != null) {
            // Clear the index cache as it is added in getSplits() method
            IndexStoreManager.getInstance().clearIndexCache(format.getOrCreateCarbonTable((job.getConfiguration())).getAbsoluteTableIdentifier(), false);
        }
    }
    return splits.toArray(new InputSplit[splits.size()]);
}
Also used : JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) JobConf(org.apache.hadoop.mapred.JobConf) JobID(org.apache.hadoop.mapreduce.JobID)
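A hedged usage sketch of this entry point through the SDK, assuming the single-argument CarbonReader.builder(tablePath) overload; the table path is a placeholder:

import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.sdk.file.CarbonReader;
import org.apache.hadoop.mapreduce.InputSplit;

public class SplitListing {
    public static void main(String[] args) throws Exception {
        // Hypothetical table path; true requests blocklet-level distribution.
        InputSplit[] splits = CarbonReader.builder("/path/to/table").getSplits(true);
        for (InputSplit split : splits) {
            System.out.println(((CarbonInputSplit) split).getFilePath());
        }
    }
}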

Aggregations

CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit): 33 uses
ArrayList (java.util.ArrayList): 17 uses
IOException (java.io.IOException): 15 uses
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 10 uses
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable): 8 uses
LinkedList (java.util.LinkedList): 6 uses
CarbonMultiBlockSplit (org.apache.carbondata.hadoop.CarbonMultiBlockSplit): 6 uses
IndexFilter (org.apache.carbondata.core.index.IndexFilter): 5 uses
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath): 5 uses
HashMap (java.util.HashMap): 4 uses
HashSet (java.util.HashSet): 4 uses
List (java.util.List): 4 uses
TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo): 4 uses
PartitionSpec (org.apache.carbondata.core.indexstore.PartitionSpec): 4 uses
LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails): 4 uses
SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager): 4 uses
CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat): 4 uses
Configuration (org.apache.hadoop.conf.Configuration): 4 uses
Path (org.apache.hadoop.fs.Path): 4 uses
Gson (com.google.gson.Gson): 3 uses