
Example 6 with CarbonFileInputFormat

use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.

the class CarbonReaderBuilder method build.

/**
 * Build CarbonReader
 *
 * @param <T> type of the records returned by the reader
 * @return CarbonReader instance
 * @throws IOException
 * @throws InterruptedException
 */
public <T> CarbonReader<T> build() throws IOException, InterruptedException {
    if (inputSplit != null) {
        return buildWithSplits(inputSplit);
    }
    if (hadoopConf == null) {
        hadoopConf = FileFactory.getConfiguration();
    }
    CarbonTableInputFormat.setCarbonReadSupport(hadoopConf, readSupportClass);
    final Job job = new Job(new JobConf(hadoopConf));
    CarbonFileInputFormat format = null;
    try {
        if (!usePaginationReader) {
            // block level dummy splits without IO and without loading the cache (when no filter is present)
            format = prepareFileInputFormat(job, false, true);
            List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
            List<RecordReader<Void, T>> readers = new ArrayList<>(splits.size());
            for (InputSplit split : splits) {
                RecordReader reader = getRecordReader(job, format, readers, split);
                readers.add(reader);
            }
            if (useArrowReader) {
                return new ArrowCarbonReader<>(readers);
            }
            return new CarbonReader<>(readers);
        } else {
            // blocklet level splits formed by reading footer and loading the cache
            format = prepareFileInputFormat(job, true, false);
            List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
            List<Long> rowCountInSplit = new ArrayList<>(splits.size());
            totalRowCountInSplits(job, splits, rowCountInSplit);
            return new PaginationCarbonReader(splits, this, rowCountInSplit);
        }
    } catch (Exception ex) {
        if (format != null) {
            // Clear the index cache as it can get added in getSplits() method
            IndexStoreManager.getInstance().clearIndexCache(format.getOrCreateCarbonTable((job.getConfiguration())).getAbsoluteTableIdentifier(), false);
        }
        throw ex;
    }
}
Also used : JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) CarbonVectorizedRecordReader(org.apache.carbondata.hadoop.util.CarbonVectorizedRecordReader) RecordReader(org.apache.hadoop.mapreduce.RecordReader) ArrayList(java.util.ArrayList) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) IOException(java.io.IOException) Job(org.apache.hadoop.mapreduce.Job) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) JobID(org.apache.hadoop.mapreduce.JobID)
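
For context, here is a minimal usage sketch of the build() path above. It assumes the SDK's CarbonReader.builder(tablePath, tableName) entry point; the store path and column names are placeholders.

import java.io.IOException;
import org.apache.carbondata.sdk.file.CarbonReader;

// Minimal sketch: read a projection of rows from a CarbonData store written by the SDK.
public static void readSample() throws IOException, InterruptedException {
    CarbonReader reader = CarbonReader
        .builder("/tmp/carbon_store", "_temp")       // placeholder path and table name
        .projection(new String[] { "name", "age" })  // placeholder columns
        .build();
    while (reader.hasNext()) {
        Object[] row = (Object[]) reader.readNextRow();
        System.out.println(row[0] + " " + row[1]);
    }
    reader.close();
}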

Example 7 with CarbonFileInputFormat

use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.

the class CarbonReaderBuilder method prepareFileInputFormat.

private CarbonFileInputFormat prepareFileInputFormat(Job job, boolean enableBlockletDistribution, boolean disableLoadBlockIndex) throws IOException {
    if (inputSplit != null && inputSplit instanceof CarbonInputSplit) {
        tablePath = ((CarbonInputSplit) inputSplit).getSegment().getReadCommittedScope().getFilePath();
        tableName = "UnknownTable" + UUID.randomUUID();
    }
    if (null == this.fileLists && null == tablePath) {
        throw new IllegalArgumentException("Please set table path first.");
    }
    // infer schema
    CarbonTable table;
    if (null != this.fileLists) {
        if (fileLists.size() < 1) {
            throw new IllegalArgumentException("fileLists must have one file in list as least!");
        }
        String commonString = String.valueOf(fileLists.get(0));
        for (int i = 1; i < fileLists.size(); i++) {
            commonString = commonString.substring(0, StringUtils.indexOfDifference(commonString, String.valueOf(fileLists.get(i))));
        }
        int index = commonString.lastIndexOf("/");
        commonString = commonString.substring(0, index);
        table = CarbonTable.buildTable(commonString, tableName, hadoopConf);
    } else {
        table = CarbonTable.buildTable(tablePath, tableName, hadoopConf);
    }
    if (enableBlockletDistribution) {
        // set cache level to blocklet level
        Map<String, String> tableProperties = table.getTableInfo().getFactTable().getTableProperties();
        tableProperties.put(CarbonCommonConstants.CACHE_LEVEL, "BLOCKLET");
        table.getTableInfo().getFactTable().setTableProperties(tableProperties);
    }
    final CarbonFileInputFormat format = new CarbonFileInputFormat();
    format.setTableInfo(job.getConfiguration(), table.getTableInfo());
    format.setTablePath(job.getConfiguration(), table.getTablePath());
    format.setTableName(job.getConfiguration(), table.getTableName());
    format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    if (filterExpression != null) {
        format.setFilterPredicates(job.getConfiguration(), new IndexFilter(table, filterExpression, true));
    }
    if (null != this.fileLists) {
        format.setFileLists(this.fileLists);
    }
    if (projectionColumns != null) {
        // set the user projection
        int len = projectionColumns.length;
        for (int i = 0; i < len; i++) {
            if (projectionColumns[i].contains(".")) {
                throw new UnsupportedOperationException("Complex child columns projection NOT supported through CarbonReader");
            }
        }
        format.setColumnProjection(job.getConfiguration(), projectionColumns);
    }
    if ((disableLoadBlockIndex) && (filterExpression == null)) {
        job.getConfiguration().set("filter_blocks", "false");
    }
    return format;
}
Also used : CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IndexFilter(org.apache.carbondata.core.index.IndexFilter)
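
The fileLists branch above infers the table directory by repeatedly narrowing the common prefix of the given paths with StringUtils.indexOfDifference, then trimming back to the last path separator. The following is a standalone sketch of that idea (a hypothetical helper, not part of the library), with an extra guard for the case where all paths are identical.

import java.util.List;
import org.apache.commons.lang3.StringUtils;

// Hypothetical helper mirroring the prefix-narrowing logic used above.
static String inferCommonParent(List<String> filePaths) {
    String common = filePaths.get(0);
    for (int i = 1; i < filePaths.size(); i++) {
        int diff = StringUtils.indexOfDifference(common, filePaths.get(i));
        if (diff >= 0) {
            // keep only the part shared with the next path
            common = common.substring(0, diff);
        }
        // diff == -1 means both strings are identical; keep common unchanged
    }
    // trim back to the last '/' so the result is a directory, not a file-name prefix
    return common.substring(0, common.lastIndexOf('/'));
}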

Example 8 with CarbonFileInputFormat

use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.

the class CarbonReaderBuilder method buildWithSplits.

private <T> CarbonReader<T> buildWithSplits(InputSplit inputSplit) throws IOException, InterruptedException {
    if (hadoopConf == null) {
        hadoopConf = FileFactory.getConfiguration();
    }
    CarbonTableInputFormat.setCarbonReadSupport(hadoopConf, readSupportClass);
    final Job job = new Job(new JobConf(hadoopConf));
    CarbonFileInputFormat format = prepareFileInputFormat(job, false, true);
    format.setAllColumnProjectionIfNotConfigured(job, format.getOrCreateCarbonTable(job.getConfiguration()));
    try {
        List<RecordReader<Void, T>> readers = new ArrayList<>(1);
        RecordReader reader = getRecordReader(job, format, readers, inputSplit);
        readers.add(reader);
        if (useArrowReader) {
            return new ArrowCarbonReader<>(readers);
        } else {
            return new CarbonReader<>(readers);
        }
    } catch (Exception ex) {
        throw ex;
    }
}
Also used : CarbonVectorizedRecordReader(org.apache.carbondata.hadoop.util.CarbonVectorizedRecordReader) RecordReader(org.apache.hadoop.mapreduce.RecordReader) ArrayList(java.util.ArrayList) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) Job(org.apache.hadoop.mapreduce.Job) JobConf(org.apache.hadoop.mapred.JobConf) IOException(java.io.IOException)
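
buildWithSplits() backs the split-based entry point, which is useful when splits are planned once and each worker reads only its own split. Below is a hedged sketch that assumes the SDK exposes a CarbonReader.builder(InputSplit) overload matching the inputSplit field checked in build(); the store path is a placeholder.

import java.io.IOException;
import org.apache.carbondata.sdk.file.CarbonReader;
import org.apache.hadoop.mapreduce.InputSplit;

// Sketch: plan splits on a driver, then read a single split (e.g. on one worker).
public static void readOneSplit() throws IOException, InterruptedException {
    InputSplit[] splits = CarbonReader
        .builder("/tmp/carbon_store", "_temp")  // placeholder path and table name
        .getSplits(true);                       // blocklet level splits
    // assumption: a builder(InputSplit) overload, as implied by the inputSplit field above
    CarbonReader reader = CarbonReader.builder(splits[0]).build();
    try {
        while (reader.hasNext()) {
            Object[] row = (Object[]) reader.readNextRow();
            // process row ...
        }
    } finally {
        reader.close();
    }
}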

Example 9 with CarbonFileInputFormat

use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.

the class CarbonReaderBuilder method totalRowCountInSplits.

private <T> void totalRowCountInSplits(Job job, List<InputSplit> splits, List<Long> rowCountInSplit) throws IOException, InterruptedException {
    CarbonFileInputFormat format = this.prepareFileInputFormat(job, false, true);
    long sum = 0;
    boolean isIUDTable = false;
    if (!StringUtils.isEmpty(this.tablePath)) {
        // Check if update or delete happened on the table.
        CarbonFile emptyMetadataFile = FileFactory.getCarbonFile(this.tablePath + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.CARBON_SDK_EMPTY_METADATA_PATH, this.hadoopConf);
        if (emptyMetadataFile.exists() && emptyMetadataFile.isDirectory()) {
            isIUDTable = true;
        }
    }
    // If a filter is present or the table has undergone update/delete (IUD), build a carbon reader
    // to count the rows; otherwise take the row count from the detail info of each split.
    if (this.filterExpression != null || isIUDTable) {
        RecordReader reader = null;
        CarbonReader carbonReader = null;
        for (InputSplit split : splits) {
            List<RecordReader<Void, T>> readers = new ArrayList<>();
            try {
                reader = this.getRecordReader(job, format, readers, split);
                readers.add(reader);
                carbonReader = new CarbonReader<>(readers);
                while (carbonReader.hasNext()) {
                    try {
                        sum += carbonReader.readNextBatchRow().length;
                    } catch (Exception ex) {
                        LOGGER.error("Exception occured while reading the batch row " + ex.getMessage());
                    }
                }
                rowCountInSplit.add(sum);
            } finally {
                if (reader != null) {
                    reader.close();
                }
                if (carbonReader != null) {
                    carbonReader.close();
                }
            }
        }
    } else {
        for (InputSplit split : splits) {
            // Prepare a running-sum array of row counts per split,
            // used for pruning with pagination values.
            // At the current index, it holds the rows of all previous splits plus the current one.
            sum += ((CarbonInputSplit) split).getDetailInfo().getRowCount();
            rowCountInSplit.add(sum);
        }
    }
}
Also used : CarbonFile(org.apache.carbondata.core.datastore.filesystem.CarbonFile) CarbonVectorizedRecordReader(org.apache.carbondata.hadoop.util.CarbonVectorizedRecordReader) RecordReader(org.apache.hadoop.mapreduce.RecordReader) ArrayList(java.util.ArrayList) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IOException(java.io.IOException)
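
The rowCountInSplit list built here is a prefix-sum array: entry i holds the total number of rows in splits 0..i. As a standalone illustration (not the PaginationCarbonReader implementation), such an array lets a paginated reader locate the split containing a given global row number with a binary search.

import java.util.List;

// Returns the index of the split containing the 1-based global row number rowId,
// where prefixSums.get(i) is the total row count of splits 0..i.
static int findSplitForRow(List<Long> prefixSums, long rowId) {
    int low = 0;
    int high = prefixSums.size() - 1;
    while (low < high) {
        int mid = (low + high) >>> 1;
        if (prefixSums.get(mid) < rowId) {
            low = mid + 1;   // rowId lies beyond splits 0..mid
        } else {
            high = mid;      // rowId falls within splits 0..mid
        }
    }
    return low;
}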

Example 10 with CarbonFileInputFormat

use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.

the class CarbonReaderBuilder method getSplits.

/**
 * Gets an array of CarbonInputSplits.
 * In CarbonData, splits can be block level or blocklet level;
 * by default splits are block level.
 *
 * @param enableBlockletDistribution returns blocklet level splits if set to true,
 *                                   else block level splits
 * @return array of InputSplit
 * @throws IOException
 */
public InputSplit[] getSplits(boolean enableBlockletDistribution) throws IOException {
    if (hadoopConf == null) {
        hadoopConf = FileFactory.getConfiguration();
    }
    Job job = null;
    List<InputSplit> splits;
    CarbonFileInputFormat format = null;
    try {
        job = new Job(new JobConf(hadoopConf));
        format = prepareFileInputFormat(job, enableBlockletDistribution, false);
        splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
        for (InputSplit split : splits) {
            // Load the detailInfo
            ((CarbonInputSplit) split).getDetailInfo();
        }
    } finally {
        if (format != null) {
            // Clear the index cache as it is added in getSplits() method
            IndexStoreManager.getInstance().clearIndexCache(format.getOrCreateCarbonTable((job.getConfiguration())).getAbsoluteTableIdentifier(), false);
        }
    }
    return splits.toArray(new InputSplit[splits.size()]);
}
Also used : JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) JobConf(org.apache.hadoop.mapred.JobConf) JobID(org.apache.hadoop.mapreduce.JobID)
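
To illustrate why getSplits() pre-loads the detail info, the row count of every block-level split can afterwards be read from split metadata alone, without opening the data files again. A minimal sketch with a placeholder path:

import java.io.IOException;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.sdk.file.CarbonReader;
import org.apache.hadoop.mapreduce.InputSplit;

// Sketch: sum the rows of all block-level splits using only split metadata.
public static long countRowsFromSplits() throws IOException {
    InputSplit[] splits = CarbonReader
        .builder("/tmp/carbon_store", "_temp")  // placeholder path and table name
        .getSplits(false);                      // block level splits
    long totalRows = 0;
    for (InputSplit split : splits) {
        // detail info was loaded by getSplits(), so this is metadata-only
        totalRows += ((CarbonInputSplit) split).getDetailInfo().getRowCount();
    }
    return totalRows;
}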

Aggregations

CarbonFileInputFormat (org.apache.carbondata.hadoop.api.CarbonFileInputFormat) 15
JobConf (org.apache.hadoop.mapred.JobConf) 13
InputSplit (org.apache.hadoop.mapreduce.InputSplit) 13
Job (org.apache.hadoop.mapreduce.Job) 13
IndexFilter (org.apache.carbondata.core.index.IndexFilter) 11
Configuration (org.apache.hadoop.conf.Configuration) 10
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) 10
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc) 10
ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) 10
Test (org.junit.Test) 10
ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) 8
CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit) 4
IOException (java.io.IOException) 3
ArrayList (java.util.ArrayList) 3
CarbonVectorizedRecordReader (org.apache.carbondata.hadoop.util.CarbonVectorizedRecordReader) 3
RecordReader (org.apache.hadoop.mapreduce.RecordReader) 3
GenericUDFOPEqual (org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual) 2
JobID (org.apache.hadoop.mapreduce.JobID) 2
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl) 2
CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile) 1