Use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.
The class CarbonReaderBuilder, method build().
/**
* Build CarbonReader
*
* @param <T>
* @return CarbonReader
* @throws IOException
* @throws InterruptedException
*/
public <T> CarbonReader<T> build() throws IOException, InterruptedException {
  if (inputSplit != null) {
    return buildWithSplits(inputSplit);
  }
  if (hadoopConf == null) {
    hadoopConf = FileFactory.getConfiguration();
  }
  CarbonTableInputFormat.setCarbonReadSupport(hadoopConf, readSupportClass);
  final Job job = new Job(new JobConf(hadoopConf));
  CarbonFileInputFormat format = null;
  try {
    if (!usePaginationReader) {
      // block level dummy splits without IO and loading the cache (if filter is not present)
      format = prepareFileInputFormat(job, false, true);
      List<InputSplit> splits =
          format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
      List<RecordReader<Void, T>> readers = new ArrayList<>(splits.size());
      for (InputSplit split : splits) {
        RecordReader reader = getRecordReader(job, format, readers, split);
        readers.add(reader);
      }
      if (useArrowReader) {
        return new ArrowCarbonReader<>(readers);
      }
      return new CarbonReader<>(readers);
    } else {
      // blocklet level splits formed by reading footer and loading the cache
      format = prepareFileInputFormat(job, true, false);
      List<InputSplit> splits =
          format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
      List<Long> rowCountInSplit = new ArrayList<>(splits.size());
      totalRowCountInSplits(job, splits, rowCountInSplit);
      return new PaginationCarbonReader(splits, this, rowCountInSplit);
    }
  } catch (Exception ex) {
    if (format != null) {
      // Clear the index cache as it can get added in the getSplits() method
      IndexStoreManager.getInstance().clearIndexCache(
          format.getOrCreateCarbonTable(job.getConfiguration()).getAbsoluteTableIdentifier(), false);
    }
    throw ex;
  }
}
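For context, a minimal usage sketch of the reader that this build() produces, assuming the standard SDK entry points CarbonReader.builder, projection and readNextRow; the store path "/tmp/carbon_table", the table name "_temp" and the projected columns are placeholders, not taken from the snippet above.

import org.apache.carbondata.sdk.file.CarbonReader;

public class CarbonReaderExample {
  public static void main(String[] args) throws Exception {
    // Placeholder path and column names, for illustration only.
    CarbonReader reader = CarbonReader.builder("/tmp/carbon_table", "_temp")
        .projection(new String[]{"name", "age"})  // optional column pruning
        .build();                                 // ends up in the build() shown above
    while (reader.hasNext()) {
      // With the default read support each row arrives as an Object[].
      Object[] row = (Object[]) reader.readNextRow();
      System.out.println(row[0] + "\t" + row[1]);
    }
    reader.close();
  }
}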
Use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.
The class CarbonReaderBuilder, method prepareFileInputFormat().
private CarbonFileInputFormat prepareFileInputFormat(Job job, boolean enableBlockletDistribution,
    boolean disableLoadBlockIndex) throws IOException {
  if (inputSplit != null && inputSplit instanceof CarbonInputSplit) {
    tablePath =
        ((CarbonInputSplit) inputSplit).getSegment().getReadCommittedScope().getFilePath();
    tableName = "UnknownTable" + UUID.randomUUID();
  }
  if (null == this.fileLists && null == tablePath) {
    throw new IllegalArgumentException("Please set table path first.");
  }
  // infer schema
  CarbonTable table;
  if (null != this.fileLists) {
    if (fileLists.size() < 1) {
      throw new IllegalArgumentException("fileLists must contain at least one file!");
    }
    String commonString = String.valueOf(fileLists.get(0));
    for (int i = 1; i < fileLists.size(); i++) {
      commonString = commonString.substring(0,
          StringUtils.indexOfDifference(commonString, String.valueOf(fileLists.get(i))));
    }
    int index = commonString.lastIndexOf("/");
    commonString = commonString.substring(0, index);
    table = CarbonTable.buildTable(commonString, tableName, hadoopConf);
  } else {
    table = CarbonTable.buildTable(tablePath, tableName, hadoopConf);
  }
  if (enableBlockletDistribution) {
    // set cache level to blocklet level
    Map<String, String> tableProperties =
        table.getTableInfo().getFactTable().getTableProperties();
    tableProperties.put(CarbonCommonConstants.CACHE_LEVEL, "BLOCKLET");
    table.getTableInfo().getFactTable().setTableProperties(tableProperties);
  }
  final CarbonFileInputFormat format = new CarbonFileInputFormat();
  format.setTableInfo(job.getConfiguration(), table.getTableInfo());
  format.setTablePath(job.getConfiguration(), table.getTablePath());
  format.setTableName(job.getConfiguration(), table.getTableName());
  format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
  if (filterExpression != null) {
    format.setFilterPredicates(job.getConfiguration(),
        new IndexFilter(table, filterExpression, true));
  }
  if (null != this.fileLists) {
    format.setFileLists(this.fileLists);
  }
  if (projectionColumns != null) {
    // set the user projection
    int len = projectionColumns.length;
    for (int i = 0; i < len; i++) {
      if (projectionColumns[i].contains(".")) {
        throw new UnsupportedOperationException(
            "Complex child columns projection NOT supported through CarbonReader");
      }
    }
    format.setColumnProjection(job.getConfiguration(), projectionColumns);
  }
  if ((disableLoadBlockIndex) && (filterExpression == null)) {
    job.getConfiguration().set("filter_blocks", "false");
  }
  return format;
}
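When a list of files is given instead of a table path, prepareFileInputFormat infers the table location from the longest common prefix of the file paths, trimmed back to a directory. A small sketch of that prefix logic with made-up paths (the sketch assumes the Apache Commons Lang 3 StringUtils; indexOfDifference behaves the same in the older commons-lang as well).

import org.apache.commons.lang3.StringUtils;

public class CommonPrefixExample {
  public static void main(String[] args) {
    // hypothetical data file paths
    String a = "/data/store/part-0-1.carbondata";
    String b = "/data/store/part-0-2.carbondata";
    // indexOfDifference returns the first index at which the two strings differ
    String common = a.substring(0, StringUtils.indexOfDifference(a, b)); // "/data/store/part-0-"
    // trim back to the last '/' so the result is a directory usable as a table path
    String tableDir = common.substring(0, common.lastIndexOf("/"));      // "/data/store"
    System.out.println(tableDir);
  }
}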
Use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.
The class CarbonReaderBuilder, method buildWithSplits().
private <T> CarbonReader<T> buildWithSplits(InputSplit inputSplit)
    throws IOException, InterruptedException {
  if (hadoopConf == null) {
    hadoopConf = FileFactory.getConfiguration();
  }
  CarbonTableInputFormat.setCarbonReadSupport(hadoopConf, readSupportClass);
  final Job job = new Job(new JobConf(hadoopConf));
  CarbonFileInputFormat format = prepareFileInputFormat(job, false, true);
  format.setAllColumnProjectionIfNotConfigured(job,
      format.getOrCreateCarbonTable(job.getConfiguration()));
  try {
    List<RecordReader<Void, T>> readers = new ArrayList<>(1);
    RecordReader reader = getRecordReader(job, format, readers, inputSplit);
    readers.add(reader);
    if (useArrowReader) {
      return new ArrowCarbonReader<>(readers);
    } else {
      return new CarbonReader<>(readers);
    }
  } catch (Exception ex) {
    throw ex;
  }
}
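The split-based path above is reached from the public API when a reader is built for a single InputSplit, typically one obtained from getSplits() (shown further below) and shipped to a worker task. A hedged sketch, assuming the CarbonReader.builder(InputSplit) overload is available in the SDK version in use; the store path and table name are placeholders.

import org.apache.carbondata.sdk.file.CarbonReader;
import org.apache.hadoop.mapreduce.InputSplit;

public class SplitReaderExample {
  public static void main(String[] args) throws Exception {
    // Blocklet-level splits; in a real job each split would be handed to its own task.
    InputSplit[] splits = CarbonReader.builder("/tmp/carbon_table", "_temp").getSplits(true);
    for (InputSplit split : splits) {
      // builder(split) routes into buildWithSplits() shown above (assumed overload).
      CarbonReader reader = CarbonReader.builder(split).build();
      while (reader.hasNext()) {
        Object[] row = (Object[]) reader.readNextRow();
      }
      reader.close();
    }
  }
}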
Use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.
The class CarbonReaderBuilder, method totalRowCountInSplits().
private <T> void totalRowCountInSplits(Job job, List<InputSplit> splits, List<Long> rowCountInSplit)
    throws IOException, InterruptedException {
  CarbonFileInputFormat format = this.prepareFileInputFormat(job, false, true);
  long sum = 0;
  boolean isIUDTable = false;
  if (!StringUtils.isEmpty(this.tablePath)) {
    // Check if an update or delete has happened on the table.
    CarbonFile emptyMetadataFile = FileFactory.getCarbonFile(
        this.tablePath + CarbonCommonConstants.FILE_SEPARATOR
            + CarbonCommonConstants.CARBON_SDK_EMPTY_METADATA_PATH, this.hadoopConf);
    if (emptyMetadataFile.exists() && emptyMetadataFile.isDirectory()) {
      isIUDTable = true;
    }
  }
  // If a filter is present or the table has seen updates/deletes, count rows by building a
  // carbon reader; otherwise take the row count from the detail info of each split.
  if (this.filterExpression != null || isIUDTable) {
    RecordReader reader = null;
    CarbonReader carbonReader = null;
    for (InputSplit split : splits) {
      List<RecordReader<Void, T>> readers = new ArrayList<>();
      try {
        reader = this.getRecordReader(job, format, readers, split);
        readers.add(reader);
        carbonReader = new CarbonReader<>(readers);
        while (carbonReader.hasNext()) {
          try {
            sum += carbonReader.readNextBatchRow().length;
          } catch (Exception ex) {
            LOGGER.error("Exception occurred while reading the batch row " + ex.getMessage());
          }
        }
        rowCountInSplit.add(sum);
      } finally {
        if (reader != null) {
          reader.close();
        }
        if (carbonReader != null) {
          carbonReader.close();
        }
      }
    }
  } else {
    for (InputSplit split : splits) {
      // Prepare a running-sum list of row counts per blocklet; it is used for pruning with
      // pagination values. At the current index it holds the sum of the rows of all previous
      // blocklets plus the current one.
      sum += ((CarbonInputSplit) split).getDetailInfo().getRowCount();
      rowCountInSplit.add(sum);
    }
  }
}
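The running totals computed here back the pagination reader, which lets a caller fetch arbitrary row ranges. A hedged usage sketch, assuming the builder's withPaginationSupport() option and the PaginationCarbonReader methods getTotalRows() and read(fromRowNumber, toRowNumber) with 1-based row numbers; the store path is a placeholder.

import org.apache.carbondata.sdk.file.CarbonReader;
import org.apache.carbondata.sdk.file.PaginationCarbonReader;

public class PaginationExample {
  public static void main(String[] args) throws Exception {
    PaginationCarbonReader<Object> reader = (PaginationCarbonReader<Object>)
        CarbonReader.builder("/tmp/carbon_table", "_temp")
            .withPaginationSupport()   // makes build() return a PaginationCarbonReader
            .build();
    long total = reader.getTotalRows();
    // Fetch the first page of up to 100 rows (row numbers assumed to start at 1).
    Object[] page = reader.read(1, Math.min(100L, total));
    System.out.println("read " + page.length + " of " + total + " rows");
    reader.close();
  }
}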
Use of org.apache.carbondata.hadoop.api.CarbonFileInputFormat in project carbondata by apache.
The class CarbonReaderBuilder, method getSplits().
/**
 * Gets an array of CarbonInputSplits.
 * In carbondata, splits can be block level or blocklet level; by default they are block level.
 *
 * @param enableBlockletDistribution returns blocklet-level splits if set to true,
 *        else block-level splits
 * @return the splits of the table
 * @throws IOException
 */
public InputSplit[] getSplits(boolean enableBlockletDistribution) throws IOException {
  if (hadoopConf == null) {
    hadoopConf = FileFactory.getConfiguration();
  }
  Job job = null;
  List<InputSplit> splits;
  CarbonFileInputFormat format = null;
  try {
    job = new Job(new JobConf(hadoopConf));
    format = prepareFileInputFormat(job, enableBlockletDistribution, false);
    splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
    for (InputSplit split : splits) {
      // Load the detailInfo
      ((CarbonInputSplit) split).getDetailInfo();
    }
  } finally {
    if (format != null) {
      // Clear the index cache as it is added in the getSplits() method
      IndexStoreManager.getInstance().clearIndexCache(
          format.getOrCreateCarbonTable(job.getConfiguration()).getAbsoluteTableIdentifier(), false);
    }
  }
  return splits.toArray(new InputSplit[splits.size()]);
}
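Finally, a hedged sketch that calls getSplits() up front, for example to plan parallel work, and reads the per-split row counts from the detail info loaded above; the store path and table name are placeholders.

import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.sdk.file.CarbonReader;
import org.apache.hadoop.mapreduce.InputSplit;

public class SplitPlanningExample {
  public static void main(String[] args) throws Exception {
    // false => block-level splits; true would return finer blocklet-level splits
    InputSplit[] splits = CarbonReader.builder("/tmp/carbon_table", "_temp").getSplits(false);
    long totalRows = 0;
    for (InputSplit split : splits) {
      // getDetailInfo() was populated inside getSplits(), so this is a local lookup
      totalRows += ((CarbonInputSplit) split).getDetailInfo().getRowCount();
    }
    System.out.println(splits.length + " splits, " + totalRows + " rows in total");
  }
}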