Use of org.apache.carbondata.core.datamap.Segment in project carbondata by apache.
The class CarbonTableInputFormat, method getFilteredSegment.
/**
 * Return the segment list to read, computed by filtering the valid segments against the
 * segments set by the user via `INPUT_SEGMENT_NUMBERS` in the job configuration.
 */
private List<Segment> getFilteredSegment(JobContext job, List<Segment> validSegments,
    boolean validationRequired) {
  Segment[] segmentsToAccess = getSegmentsToAccess(job);
  List<Segment> segmentToAccessSet =
      new ArrayList<>(new HashSet<>(Arrays.asList(segmentsToAccess)));
  List<Segment> filteredSegmentToAccess = new ArrayList<>();
  if (segmentsToAccess.length == 0 || segmentsToAccess[0].getSegmentNo().equalsIgnoreCase("*")) {
    filteredSegmentToAccess.addAll(validSegments);
  } else {
    for (Segment validSegment : validSegments) {
      int index = segmentToAccessSet.indexOf(validSegment);
      if (index > -1) {
        // For an in-progress segment the segment file name is carried on the requested
        // segment (from the property), so prefer that instance over the one from table status.
        if (segmentToAccessSet.get(index).getSegmentFileName() != null
            && validSegment.getSegmentFileName() == null) {
          filteredSegmentToAccess.add(segmentToAccessSet.get(index));
        } else {
          filteredSegmentToAccess.add(validSegment);
        }
      }
    }
    if (filteredSegmentToAccess.size() != segmentToAccessSet.size() && !validationRequired) {
      for (Segment segment : segmentToAccessSet) {
        if (!filteredSegmentToAccess.contains(segment)) {
          filteredSegmentToAccess.add(segment);
        }
      }
    }
    if (!filteredSegmentToAccess.containsAll(segmentToAccessSet)) {
      // Log the requested segments that were dropped because they are not valid.
      // (The original computed the difference in the wrong direction, logging the
      // filtered-minus-requested set instead of requested-minus-filtered.)
      List<Segment> ignoredSegments = new ArrayList<>(segmentToAccessSet);
      ignoredSegments.removeAll(filteredSegmentToAccess);
      LOG.info("Segments ignored are : " + Arrays.toString(ignoredSegments.toArray()));
    }
  }
  return filteredSegmentToAccess;
}
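To see the filtering rules in isolation, here is a minimal, self-contained sketch. SegmentFilterSketch and filterSegments are hypothetical names operating on plain segment-number strings rather than the Segment class, but the rules mirror the method above: an empty request or a "*" wildcard selects every valid segment, otherwise only requested segments that are also valid survive. The validationRequired=false path, which re-adds requested-but-not-yet-valid segments, is omitted for brevity.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class SegmentFilterSketch {
  // Mirrors getFilteredSegment: "*" (or an empty request) means all valid segments,
  // otherwise keep only the requested segments that are also valid.
  static List<String> filterSegments(List<String> validSegments, String[] requested) {
    List<String> result = new ArrayList<>();
    if (requested.length == 0 || requested[0].equalsIgnoreCase("*")) {
      result.addAll(validSegments);
      return result;
    }
    // A set both deduplicates the request and gives O(1) membership checks.
    Set<String> requestedSet = new HashSet<>(Arrays.asList(requested));
    for (String valid : validSegments) {
      if (requestedSet.contains(valid)) {
        result.add(valid);
      }
    }
    return result;
  }

  public static void main(String[] args) {
    List<String> valid = Arrays.asList("0", "1", "2");
    // "3" is requested but not valid, so it is ignored.
    System.out.println(filterSegments(valid, new String[] {"1", "3"})); // [1]
    System.out.println(filterSegments(valid, new String[] {"*"}));      // [0, 1, 2]
  }
}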
Use of org.apache.carbondata.core.datamap.Segment in project carbondata by apache.
The class CarbonTableInputFormat, method getSplits.
/**
 * {@inheritDoc}
 * The configuration `FileInputFormat.INPUT_DIR` is used to get the table path to read.
 *
 * @param job the job context carrying the configuration
 * @return list of CarbonInputSplit
 * @throws IOException if the table status or schema file cannot be read
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
  LoadMetadataDetails[] loadMetadataDetails = SegmentStatusManager.readTableStatusFile(
      CarbonTablePath.getTableStatusFilePath(identifier.getTablePath()));
  CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
  if (null == carbonTable) {
    throw new IOException("Missing/Corrupt schema file for table.");
  }
  SegmentUpdateStatusManager updateStatusManager =
      new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails);
  List<Segment> invalidSegments = new ArrayList<>();
  List<UpdateVO> invalidTimestampsList = new ArrayList<>();
  List<Segment> streamSegments = null;
  // get all valid segments and set them into the configuration
  SegmentStatusManager segmentStatusManager = new SegmentStatusManager(identifier);
  SegmentStatusManager.ValidAndInvalidSegmentsInfo segments =
      segmentStatusManager.getValidAndInvalidSegments(loadMetadataDetails);
  if (getValidateSegmentsToAccess(job.getConfiguration())) {
    List<Segment> validSegments = segments.getValidSegments();
    streamSegments = segments.getStreamSegments();
    streamSegments = getFilteredSegment(job, streamSegments, true);
    if (validSegments.size() == 0) {
      return getSplitsOfStreaming(job, identifier, streamSegments);
    }
    List<Segment> filteredSegmentToAccess =
        getFilteredSegment(job, segments.getValidSegments(), true);
    if (filteredSegmentToAccess.size() == 0) {
      return getSplitsOfStreaming(job, identifier, streamSegments);
    } else {
      setSegmentsToAccess(job.getConfiguration(), filteredSegmentToAccess);
    }
    // remove entries from the segment index if there are invalid segments
    invalidSegments.addAll(segments.getInvalidSegments());
    for (Segment invalidSegmentId : invalidSegments) {
      invalidTimestampsList.add(
          updateStatusManager.getInvalidTimestampRange(invalidSegmentId.getSegmentNo()));
    }
    if (invalidSegments.size() > 0) {
      DataMapStoreManager.getInstance().clearInvalidSegments(
          getOrCreateCarbonTable(job.getConfiguration()), invalidSegments);
    }
  }
  ArrayList<Segment> validAndInProgressSegments = new ArrayList<>(segments.getValidSegments());
  // Also include in-progress segments, because an aggregate table load reads data from
  // segments that are still in progress.
  validAndInProgressSegments.addAll(segments.getListOfInProgressSegments());
  // get the updated filtered list
  List<Segment> filteredSegmentToAccess =
      getFilteredSegment(job, new ArrayList<>(validAndInProgressSegments), false);
  // Clear updated segments from memory if an update has happened on any segment
  List<Segment> toBeCleanedSegments = new ArrayList<>();
  for (SegmentUpdateDetails segmentUpdateDetail : updateStatusManager.getUpdateStatusDetails()) {
    boolean refreshNeeded = DataMapStoreManager.getInstance()
        .getTableSegmentRefresher(carbonTable)
        .isRefreshNeeded(segmentUpdateDetail.getSegmentName(), updateStatusManager);
    if (refreshNeeded) {
      toBeCleanedSegments.add(new Segment(segmentUpdateDetail.getSegmentName(), null));
    }
  }
  // Clean segments if a refresh is needed
  for (Segment segment : filteredSegmentToAccess) {
    if (DataMapStoreManager.getInstance().getTableSegmentRefresher(carbonTable)
        .isRefreshNeeded(segment.getSegmentNo())) {
      toBeCleanedSegments.add(segment);
    }
  }
  if (toBeCleanedSegments.size() > 0) {
    DataMapStoreManager.getInstance().clearInvalidSegments(
        getOrCreateCarbonTable(job.getConfiguration()), toBeCleanedSegments);
  }
  // process and resolve the filter expression
  Expression filter = getFilterPredicates(job.getConfiguration());
  TableProvider tableProvider = new SingleTableProvider(carbonTable);
  // this will be null for a non-partitioned table
  PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getTableName());
  carbonTable.processFilterExpression(filter, null, null);
  // prune partitions for a filter query on a partitioned table
  BitSet matchedPartitions = null;
  if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
    matchedPartitions = setMatchedPartitions(null, filter, partitionInfo, null);
    if (matchedPartitions != null) {
      if (matchedPartitions.cardinality() == 0) {
        // no partition survives pruning, so there is nothing to read
        return new ArrayList<>();
      } else if (matchedPartitions.cardinality() == partitionInfo.getNumPartitions()) {
        // all partitions matched; null disables pruning downstream
        matchedPartitions = null;
      }
    }
  }
  FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
  // do block filtering and get splits
  List<InputSplit> splits = getSplits(job, filterInterface, filteredSegmentToAccess,
      matchedPartitions, partitionInfo, null, updateStatusManager);
  // pass the invalid segments to the task side so their index entries can be removed there
  if (invalidSegments.size() > 0) {
    for (InputSplit split : splits) {
      ((CarbonInputSplit) split).setInvalidSegments(invalidSegments);
      ((CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
    }
  }
  // add all streaming splits
  List<InputSplit> splitsOfStreaming = getSplitsOfStreaming(job, identifier, streamSegments);
  if (!splitsOfStreaming.isEmpty()) {
    splits.addAll(splitsOfStreaming);
  }
  return splits;
}
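The matchedPartitions handling above follows a convention that is easy to miss: a BitSet with cardinality zero short-circuits to an empty split list, while a BitSet that matches every partition is normalized back to null, which downstream code treats as "no pruning". A small self-contained sketch of that normalization (PartitionPruneSketch and normalize are hypothetical names, not CarbonData API):

import java.util.BitSet;

public class PartitionPruneSketch {
  // Normalizes a matched-partition bitmap the same way getSplits does:
  // null means "scan all partitions"; an empty bitmap means "scan nothing".
  static BitSet normalize(BitSet matched, int numPartitions) {
    if (matched != null && matched.cardinality() == numPartitions) {
      return null; // every partition matched, so pruning would be a no-op
    }
    return matched;
  }

  public static void main(String[] args) {
    BitSet all = new BitSet(4);
    all.set(0, 4); // bits 0..3 set: all four partitions match
    System.out.println(normalize(all, 4));       // null -> pruning disabled
    BitSet none = new BitSet(4);
    System.out.println(none.cardinality() == 0); // true -> caller returns empty splits
  }
}

Collapsing the all-match case to null lets the block-filtering stage skip per-block partition checks entirely instead of testing against a fully set bitmap.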
Use of org.apache.carbondata.core.datamap.Segment in project carbondata by apache.
The class CarbonTableInputFormat, method getSplitsOfOneSegment.
/**
 * Read data in one segment; used by the ALTER TABLE partition statement.
 * @param job the job context carrying the configuration
 * @param targetSegment segment number of the segment to read
 * @param oldPartitionIdList old partition ids recorded before partitionInfo was changed
 * @param partitionInfo partition information of the table after the change
 * @return list of CarbonInputSplit for the target segment
 */
public List<InputSplit> getSplitsOfOneSegment(JobContext job, String targetSegment,
    List<Integer> oldPartitionIdList, PartitionInfo partitionInfo) {
  List<Segment> invalidSegments = new ArrayList<>();
  List<UpdateVO> invalidTimestampsList = new ArrayList<>();
  List<Segment> segmentList = new ArrayList<>();
  segmentList.add(new Segment(targetSegment, null));
  setSegmentsToAccess(job.getConfiguration(), segmentList);
  try {
    // process and resolve the filter expression
    Expression filter = getFilterPredicates(job.getConfiguration());
    CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
    // this will be null in case of a corrupt schema file
    if (null == carbonTable) {
      throw new IOException("Missing/Corrupt schema file for table.");
    }
    carbonTable.processFilterExpression(filter, null, null);
    TableProvider tableProvider = new SingleTableProvider(carbonTable);
    // prune partitions for a filter query on a partitioned table
    String partitionIds = job.getConfiguration().get(ALTER_PARTITION_ID);
    // matchedPartitions records the partition index, not the partition id
    BitSet matchedPartitions = null;
    if (partitionInfo != null) {
      matchedPartitions =
          setMatchedPartitions(partitionIds, filter, partitionInfo, oldPartitionIdList);
      if (matchedPartitions != null) {
        if (matchedPartitions.cardinality() == 0) {
          return new ArrayList<>();
        } else if (matchedPartitions.cardinality() == partitionInfo.getNumPartitions()) {
          matchedPartitions = null;
        }
      }
    }
    FilterResolverIntf filterInterface = carbonTable.resolveFilter(filter, tableProvider);
    // do block filtering and get splits
    List<InputSplit> splits = getSplits(job, filterInterface, segmentList, matchedPartitions,
        partitionInfo, oldPartitionIdList, new SegmentUpdateStatusManager(carbonTable));
    // pass the invalid segments to the task side so their index entries can be removed there
    if (invalidSegments.size() > 0) {
      for (InputSplit split : splits) {
        ((CarbonInputSplit) split).setInvalidSegments(invalidSegments);
        ((CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
      }
    }
    return splits;
  } catch (IOException e) {
    throw new RuntimeException("Can't get splits of the target segment", e);
  }
}
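A sketch of how a driver might call this method during an ALTER TABLE partition operation, based only on the signature shown above. The table path is hypothetical, the import path for CarbonTableInputFormat is assumed, and a null partitionInfo (tolerated by the null check inside the method) is passed to skip partition pruning:

import java.util.Arrays;
import java.util.List;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class AlterPartitionSplitSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // Hypothetical table path; getSplits reads FileInputFormat.INPUT_DIR as the table path.
    FileInputFormat.setInputPaths(job, new Path("/store/default/sample_table"));
    CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
    // Partition ids recorded before the ALTER; null partitionInfo disables pruning.
    List<Integer> oldPartitionIdList = Arrays.asList(0, 1, 2);
    List<InputSplit> splits = format.getSplitsOfOneSegment(job, "2", oldPartitionIdList, null);
    System.out.println("splits for segment 2: " + splits.size());
  }
}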