Use of org.apache.carbondata.core.index.Segment in project carbondata by apache.
The class CarbonFileInputFormat, method getSplits.
/**
* Get the list of blocks/blocklets and convert them to CarbonInputSplit
* @param job JobContext with Configuration
* @return list of CarbonInputSplit
* @throws IOException
*/
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
if (null == carbonTable) {
throw new IOException("Missing/Corrupt schema file for table.");
}
AbsoluteTableIdentifier identifier = carbonTable.getAbsoluteTableIdentifier();
// get all valid segments and set them into the configuration
// check for externalTable segment (Segment_null)
// process and resolve the expression
ReadCommittedScope readCommittedScope;
if (carbonTable.isTransactionalTable()) {
readCommittedScope = new LatestFilesReadCommittedScope(identifier.getTablePath() + "/Fact/Part0/Segment_null/", job.getConfiguration());
} else {
readCommittedScope = getReadCommittedScope(job.getConfiguration());
if (readCommittedScope == null) {
readCommittedScope = new LatestFilesReadCommittedScope(identifier.getTablePath(), job.getConfiguration());
} else {
readCommittedScope.setConfiguration(job.getConfiguration());
}
}
// this will be null in case of corrupt schema file.
IndexFilter filter = getFilterPredicates(job.getConfiguration());
// if external table Segments are found, add it to the List
List<Segment> externalTableSegments = new ArrayList<>();
Segment seg;
if (carbonTable.isTransactionalTable()) {
// In some cases the SDK writes into the segment path instead of the table path, i.e. inside
// "Fact/Part0/Segment_null". The segment in this case is named "null". Such a table is
// treated by default as a transactional table and goes through the CarbonFileInputFormat
// path. That scenario is handled in the code below.
seg = new Segment("null", null, readCommittedScope);
externalTableSegments.add(seg);
} else {
LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
for (LoadMetadataDetails load : loadMetadataDetails) {
seg = new Segment(load.getLoadName(), null, readCommittedScope);
if (fileLists != null) {
for (Object fileList : fileLists) {
String timestamp = CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileList.toString());
if (timestamp.equals(seg.getSegmentNo())) {
externalTableSegments.add(seg);
break;
}
}
} else {
externalTableSegments.add(seg);
}
}
}
List<InputSplit> splits = new ArrayList<>();
boolean useBlockIndex = job.getConfiguration().getBoolean("filter_blocks", true);
// scenarios
if (filter != null) {
filter.resolve(false);
}
if (useBlockIndex) {
// do block filtering and get split
splits = getSplits(job, filter, externalTableSegments);
} else {
List<CarbonFile> carbonFiles;
if (null != this.fileLists) {
carbonFiles = getAllCarbonDataFiles(this.fileLists);
} else {
carbonFiles = getAllCarbonDataFiles(carbonTable.getTablePath());
}
List<String> allDeleteDeltaFiles = getAllDeleteDeltaFiles(carbonTable.getTablePath());
for (CarbonFile carbonFile : carbonFiles) {
// Segment id is set to "null" because the SDK does not write carbondata files with respect
// to segments, so no specific segment name is present for this load.
CarbonInputSplit split = new CarbonInputSplit("null", carbonFile.getAbsolutePath(), 0, carbonFile.getLength(), carbonFile.getLocations(), FileFormat.COLUMNAR_V3);
split.setVersion(ColumnarFormatVersion.V3);
BlockletDetailInfo info = new BlockletDetailInfo();
split.setDetailInfo(info);
info.setBlockSize(carbonFile.getLength());
info.setVersionNumber(split.getVersion().number());
info.setUseMinMaxForPruning(false);
if (CollectionUtils.isNotEmpty(allDeleteDeltaFiles)) {
split.setDeleteDeltaFiles(getDeleteDeltaFiles(carbonFile.getAbsolutePath(), allDeleteDeltaFiles));
}
splits.add(split);
}
splits.sort(Comparator.comparing(o -> ((CarbonInputSplit) o).getFilePath()));
}
setAllColumnProjectionIfNotConfigured(job, carbonTable);
return splits;
}
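For reference, a minimal standalone sketch of the non-transactional branch above: it builds Segment objects from a LatestFilesReadCommittedScope over a hypothetical SDK-written table path. The path "/tmp/carbon_output" and the import locations are assumptions for the illustration; the constructor and method calls themselves are the ones the snippet uses.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.carbondata.core.index.Segment;
import org.apache.carbondata.core.readcommitter.LatestFilesReadCommittedScope;
import org.apache.carbondata.core.readcommitter.ReadCommittedScope;
import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;

import org.apache.hadoop.conf.Configuration;

public class SegmentListingSketch {
  public static void main(String[] args) throws IOException {
    // Hypothetical path of a table written by the CarbonData SDK (non-transactional).
    String tablePath = "/tmp/carbon_output";
    Configuration conf = new Configuration();
    // Same scope type that the else-branch of getSplits() falls back to.
    ReadCommittedScope scope = new LatestFilesReadCommittedScope(tablePath, conf);
    // One Segment per load, mirroring the non-transactional loop in getSplits().
    List<Segment> segments = new ArrayList<>();
    for (LoadMetadataDetails load : scope.getSegmentList()) {
      segments.add(new Segment(load.getLoadName(), null, scope));
    }
    for (Segment segment : segments) {
      System.out.println("segment: " + segment.getSegmentNo());
    }
  }
}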
Use of org.apache.carbondata.core.index.Segment in project carbondata by apache.
The class CarbonOutputCommitter, method commitJobFinal.
private void commitJobFinal(JobContext context, CarbonLoadModel loadModel, OperationContext operationContext, CarbonTable carbonTable, String uniqueId) throws IOException {
if (operationContext != null) {
LoadEvents.LoadTablePostStatusUpdateEvent postStatusUpdateEvent = new LoadEvents.LoadTablePostStatusUpdateEvent(loadModel);
try {
OperationListenerBus.getInstance().fireEvent(postStatusUpdateEvent, operationContext);
} catch (Exception e) {
throw new IOException(e);
}
}
String updateTime = context.getConfiguration().get(CarbonTableOutputFormat.UPDATE_TIMESTAMP, uniqueId);
String segmentsToBeDeleted = context.getConfiguration().get(CarbonTableOutputFormat.SEGMENTS_TO_BE_DELETED, "");
List<Segment> segmentDeleteList = Collections.emptyList();
if (!segmentsToBeDeleted.trim().isEmpty()) {
segmentDeleteList = Segment.toSegmentList(segmentsToBeDeleted.split(","), null);
}
boolean isUpdateStatusFileUpdateRequired = (context.getConfiguration().get(CarbonTableOutputFormat.UPDATE_TIMESTAMP) != null);
if (updateTime != null) {
CarbonUpdateUtil.updateTableMetadataStatus(Collections.singleton(loadModel.getSegment()), carbonTable, updateTime, true, isUpdateStatusFileUpdateRequired, segmentDeleteList);
}
}
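As a small illustration of the Segment.toSegmentList call above, the following sketch converts a made-up comma-separated segment string (the same shape as the SEGMENTS_TO_BE_DELETED configuration value) into a Segment list; the value "0,2,3" is invented for the example.
import java.util.Collections;
import java.util.List;

import org.apache.carbondata.core.index.Segment;

public class SegmentDeleteListSketch {
  public static void main(String[] args) {
    // Hypothetical value of the SEGMENTS_TO_BE_DELETED configuration property.
    String segmentsToBeDeleted = "0,2,3";
    List<Segment> segmentDeleteList = Collections.emptyList();
    if (!segmentsToBeDeleted.trim().isEmpty()) {
      // Same conversion used by commitJobFinal(); the second argument
      // is passed as null here, exactly as the method above does.
      segmentDeleteList = Segment.toSegmentList(segmentsToBeDeleted.split(","), null);
    }
    for (Segment segment : segmentDeleteList) {
      System.out.println("to be deleted: " + segment.getSegmentNo());
    }
  }
}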
Use of org.apache.carbondata.core.index.Segment in project carbondata by apache.
The class CarbonOutputCommitter, method overwritePartitions.
/**
* Overwrite the partitions in case of an overwrite query. It just updates the partition map files
* of all segment files.
*/
private String overwritePartitions(CarbonLoadModel loadModel, LoadMetadataDetails newMetaEntry, String uuid, List<String> partitionList, List<PartitionSpec> currentPartitionsOfTable) throws IOException {
CarbonTable table = loadModel.getCarbonDataLoadSchema().getCarbonTable();
if (partitionList != null && partitionList.size() > 0) {
// check if any partitions overlaps
List<String> overlappingPartitions = currentPartitionsOfTable.stream().map(partitionSpec -> partitionSpec.getLocation().toString()).filter(partitionList::contains).collect(Collectors.toList());
if (!overlappingPartitions.isEmpty()) {
List<LoadMetadataDetails> validLoadMetadataDetails = loadModel.getLoadMetadataDetails().stream().filter(loadMetadataDetail -> !loadMetadataDetail.getLoadName().equalsIgnoreCase(newMetaEntry.getLoadName())).collect(Collectors.toList());
String uniqueId = String.valueOf(System.currentTimeMillis());
List<String> toBeUpdatedSegments = new ArrayList<>(validLoadMetadataDetails.size());
List<String> toBeDeletedSegments = new ArrayList<>(validLoadMetadataDetails.size());
// First drop the partitions from partition mapper files of each segment
for (LoadMetadataDetails loadMetadataDetail : validLoadMetadataDetails) {
new SegmentFileStore(table.getTablePath(), loadMetadataDetail.getSegmentFile()).dropPartitions(loadMetadataDetail.getLoadName(), partitionList, uniqueId, toBeDeletedSegments, toBeUpdatedSegments);
}
newMetaEntry.setUpdateStatusFileName(uniqueId);
// Commit the removed partitions in carbon store.
CarbonLoaderUtil.recordNewLoadMetadata(newMetaEntry, loadModel, false, false, uuid, Segment.toSegmentList(toBeDeletedSegments, null), Segment.toSegmentList(toBeUpdatedSegments, null), false);
return uniqueId;
} else {
CarbonLoaderUtil.recordNewLoadMetadata(newMetaEntry, loadModel, false, false, uuid, false);
return null;
}
}
return null;
}
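To make the overlap check concrete, here is a minimal sketch with invented partition locations: it applies the same stream filter that overwritePartitions() uses to find the partitions that must be dropped from older segments.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class PartitionOverlapSketch {
  public static void main(String[] args) {
    // Invented partition locations currently recorded for the table.
    List<String> currentLocations = Arrays.asList(
        "/store/db/tbl/country=US", "/store/db/tbl/country=IN");
    // Invented partition locations produced by the overwrite query.
    List<String> partitionList = Arrays.asList("/store/db/tbl/country=IN");

    // Same overlap check as overwritePartitions(): keep every current location
    // that also appears in the incoming partition list.
    List<String> overlappingPartitions = currentLocations.stream()
        .filter(partitionList::contains)
        .collect(Collectors.toList());
    System.out.println(overlappingPartitions);  // prints [/store/db/tbl/country=IN]
  }
}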
Use of org.apache.carbondata.core.index.Segment in project carbondata by apache.
The class CarbonTableInputFormat, method updateLoadMetaDataDetailsToSegments.
public void updateLoadMetaDataDetailsToSegments(List<Segment> validSegments, List<org.apache.carbondata.hadoop.CarbonInputSplit> prunedSplits) {
Map<String, Segment> validSegmentsMap = validSegments.stream().collect(Collectors.toMap(Segment::getSegmentNo, segment -> segment, (e1, e2) -> e1));
for (CarbonInputSplit split : prunedSplits) {
Segment segment = split.getSegment();
if (segment.getLoadMetadataDetails() == null || segment.getReadCommittedScope() == null) {
if (validSegmentsMap.containsKey(segment.getSegmentNo())) {
segment.setLoadMetadataDetails(validSegmentsMap.get(segment.getSegmentNo()).getLoadMetadataDetails());
segment.setReadCommittedScope(validSegmentsMap.get(segment.getSegmentNo()).getReadCommittedScope());
}
}
}
}
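The toMap collector above keeps the first entry when two segments report the same segment number. Below is a minimal sketch of that behavior; the Segment instances are built with null placeholders (and a cast for the read-committed scope), which is an assumption made purely for illustration since real callers pass a proper scope.
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.carbondata.core.index.Segment;
import org.apache.carbondata.core.readcommitter.ReadCommittedScope;

public class ValidSegmentsMapSketch {
  public static void main(String[] args) {
    // Null placeholders stand in for the segment file name and read-committed
    // scope that real callers would supply.
    List<Segment> validSegments = Arrays.asList(
        new Segment("0", null, (ReadCommittedScope) null),
        new Segment("1", null, (ReadCommittedScope) null),
        new Segment("1", null, (ReadCommittedScope) null));  // duplicate segment number

    // Same collector as updateLoadMetaDataDetailsToSegments(): key by segment
    // number and keep the first Segment when a number repeats.
    Map<String, Segment> validSegmentsMap = validSegments.stream()
        .collect(Collectors.toMap(Segment::getSegmentNo, segment -> segment, (e1, e2) -> e1));
    System.out.println(validSegmentsMap.size());  // prints 2
  }
}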
Use of org.apache.carbondata.core.index.Segment in project carbondata by apache.
The class CarbonInputFormat, method getPrunedBlocklets.
/**
* Prune the blocklets using the filter expression and the available indexes.
* Pruning is done first with the default blocklet index, then with the CG and FG indexes.
*/
public List<ExtendedBlocklet> getPrunedBlocklets(JobContext job, CarbonTable carbonTable, IndexFilter filter, List<Segment> validSegments, List<Segment> invalidSegments, List<String> segmentsToBeRefreshed) throws IOException {
ExplainCollector.addPruningInfo(carbonTable.getTableName());
filter = filter == null ? new IndexFilter(carbonTable, null) : filter;
ExplainCollector.setFilterStatement(filter.getExpression() == null ? "none" : filter.getExpression().getStatement());
boolean distributedCG = Boolean.parseBoolean(CarbonProperties.getInstance().getProperty(CarbonCommonConstants.USE_DISTRIBUTED_INDEX, CarbonCommonConstants.USE_DISTRIBUTED_INDEX_DEFAULT));
IndexJob indexJob = IndexUtil.getIndexJob(job.getConfiguration());
List<PartitionSpec> partitionsToPrune = getPartitionsToPrune(job.getConfiguration());
// First prune using default index on driver side.
TableIndex defaultIndex = IndexStoreManager.getInstance().getDefaultIndex(carbonTable);
List<ExtendedBlocklet> prunedBlocklets;
// Log the event so that the user can follow the pruning progress in the logs.
LOG.info("Started block pruning ...");
boolean isDistributedPruningEnabled = CarbonProperties.getInstance().isDistributedPruningEnabled(carbonTable.getDatabaseName(), carbonTable.getTableName());
boolean isIndexServerContext = job.getConfiguration().get("isIndexServerContext", "false").equals("true");
if (isDistributedPruningEnabled && !isIndexServerContext) {
try {
prunedBlocklets = getDistributedSplit(carbonTable, filter.getResolver(), partitionsToPrune, validSegments, invalidSegments, segmentsToBeRefreshed, false, job.getConfiguration(), filter.getMissingSISegments());
} catch (Exception e) {
// If distributed pruning fails, fall back to driver-side pruning unless fallback is disabled.
if (CarbonProperties.getInstance().isFallBackDisabled()) {
throw e;
}
prunedBlocklets = defaultIndex.prune(validSegments, filter, partitionsToPrune);
}
} else {
if (carbonTable.isTransactionalTable()) {
IndexExprWrapper indexExprWrapper = IndexChooser.getDefaultIndex(getOrCreateCarbonTable(job.getConfiguration()), null);
IndexUtil.loadIndexes(carbonTable, indexExprWrapper, validSegments);
}
prunedBlocklets = defaultIndex.prune(validSegments, filter, partitionsToPrune);
if (ExplainCollector.enabled()) {
ExplainCollector.setDefaultIndexPruningBlockHit(getBlockCount(prunedBlocklets));
}
if (prunedBlocklets.size() == 0) {
return prunedBlocklets;
}
IndexChooser chooser = new IndexChooser(getOrCreateCarbonTable(job.getConfiguration()), isSecondaryIndexPruningEnabled(job.getConfiguration()));
// Get the available CG indexes and prune further.
IndexExprWrapper cgIndexExprWrapper = chooser.chooseCGIndex(filter.getResolver());
if (cgIndexExprWrapper != null) {
// Prune segments from already pruned blocklets
IndexUtil.pruneSegments(validSegments, prunedBlocklets);
List<ExtendedBlocklet> cgPrunedBlocklets = new ArrayList<>();
// If an SI is present in cgIndexExprWrapper, set on each segment the list of
// blocklets pruned by the default index; this list will be returned from the
// SI prune method if the segment is not present in the SI.
Map<String, List<ExtendedBlocklet>> segmentsToBlocklet = new HashMap<>();
for (ExtendedBlocklet extendedBlocklet : prunedBlocklets) {
List<ExtendedBlocklet> extendedBlockletList = segmentsToBlocklet.getOrDefault(extendedBlocklet.getSegmentId(), new ArrayList<>());
extendedBlockletList.add(extendedBlocklet);
segmentsToBlocklet.put(extendedBlocklet.getSegmentId(), extendedBlockletList);
}
for (Segment seg : validSegments) {
seg.setDefaultIndexPrunedBlocklets(segmentsToBlocklet.get(seg.getSegmentNo()));
}
boolean isCGPruneFallback = false;
// Again prune with CG index.
try {
if (distributedCG && indexJob != null) {
cgPrunedBlocklets = IndexUtil.executeIndexJob(carbonTable, filter.getResolver(), indexJob, partitionsToPrune, validSegments, invalidSegments, IndexLevel.CG, new ArrayList<>(), job.getConfiguration());
} else {
cgPrunedBlocklets = cgIndexExprWrapper.prune(validSegments, partitionsToPrune);
}
} catch (Exception e) {
isCGPruneFallback = true;
LOG.error("CG index pruning failed.", e);
}
// If CG pruning failed, fall back to the default index result: no need to intersect, simply pass the prunedBlocklets from the default index
if (!isCGPruneFallback) {
if (isIndexServerContext) {
// For all blocklets initialize the detail info so that it can be serialized to driver
for (ExtendedBlocklet blocklet : cgPrunedBlocklets) {
blocklet.getDetailInfo();
blocklet.setCgIndexPresent(true);
}
}
// since the CG index prunes at segment scope,
// the result needs to be intersected with the previously pruned result
prunedBlocklets = intersectFilteredBlocklets(carbonTable, prunedBlocklets, cgPrunedBlocklets);
}
if (ExplainCollector.enabled()) {
ExplainCollector.recordCGIndexPruning(IndexWrapperSimpleInfo.fromIndexWrapper(cgIndexExprWrapper), prunedBlocklets.size(), getBlockCount(prunedBlocklets));
}
}
if (prunedBlocklets.size() == 0) {
return prunedBlocklets;
}
// Now try to prune with FG Index.
if (isFgIndexPruningEnable(job.getConfiguration()) && indexJob != null) {
IndexExprWrapper fgIndexExprWrapper = chooser.chooseFGIndex(filter.getResolver());
List<ExtendedBlocklet> fgPrunedBlocklets;
if (fgIndexExprWrapper != null) {
// Prune segments from already pruned blocklets
IndexUtil.pruneSegments(validSegments, prunedBlocklets);
fgPrunedBlocklets = IndexUtil.executeIndexJob(carbonTable, filter.getResolver(), indexJob, partitionsToPrune, validSegments, invalidSegments, fgIndexExprWrapper.getIndexLevel(), new ArrayList<>(), job.getConfiguration());
// note that the 'fgPrunedBlocklets' has extra index related info compared with
// 'prunedBlocklets', so the intersection should keep the elements in 'fgPrunedBlocklets'
prunedBlocklets = intersectFilteredBlocklets(carbonTable, prunedBlocklets, fgPrunedBlocklets);
ExplainCollector.recordFGIndexPruning(IndexWrapperSimpleInfo.fromIndexWrapper(fgIndexExprWrapper), prunedBlocklets.size(), getBlockCount(prunedBlocklets));
}
}
}
LOG.info("Finished block pruning ...");
return prunedBlocklets;
}
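The note above about keeping the elements of 'fgPrunedBlocklets' is easy to miss. The sketch below shows the shape of such an intersection with plain string identifiers standing in for blocklets (the ids are invented); it illustrates only the keep-the-second-list idea from the comment, not the actual intersectFilteredBlocklets implementation.
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class IntersectKeepsFgSketch {
  public static void main(String[] args) {
    // Blocklets that survived the default-index prune (invented ids).
    Set<String> defaultPruned =
        new HashSet<>(Arrays.asList("seg0/block0", "seg0/block1", "seg1/block0"));
    // Blocklets returned by the FG index; in the real code these carry the extra
    // FG-index detail, which is why the intersection keeps these instances.
    List<String> fgPruned = Arrays.asList("seg0/block1", "seg1/block0", "seg1/block5");

    List<String> intersected = fgPruned.stream()
        .filter(defaultPruned::contains)
        .collect(Collectors.toList());
    System.out.println(intersected);  // prints [seg0/block1, seg1/block0]
  }
}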